From d0839279985ce07636e7f39ac3666f93b2c9e7b5 Mon Sep 17 00:00:00 2001 From: vincenttsai2015 Date: Tue, 18 Nov 2025 08:20:54 +0000 Subject: [PATCH 1/8] Update for smmoth execution --- README.md | 76 +++++++------------ configs/experiment/ego.yaml | 9 ++- configs/experiment/nx_graphs.yaml | 28 +++---- configs/experiment/planar.yaml | 8 +- configs/experiment/sbm.yaml | 9 ++- requirements.txt | 36 ++++----- sparse_diffusion/analysis/visualization.py | 8 +- sparse_diffusion/datasets/abstract_dataset.py | 10 ++- sparse_diffusion/datasets/guacamol_dataset.py | 16 ++-- sparse_diffusion/datasets/moses_dataset.py | 18 ++--- sparse_diffusion/datasets/protein_dataset.py | 13 ++-- sparse_diffusion/datasets/qm9_dataset.py | 15 ++-- .../datasets/spectre_dataset_pyg.py | 14 ++-- sparse_diffusion/diffusion/diffusion_utils.py | 12 ++- sparse_diffusion/diffusion/extra_features.py | 8 +- .../diffusion/extra_features_molecular.py | 7 +- sparse_diffusion/diffusion/noise_schedule.py | 9 ++- sparse_diffusion/diffusion/sample_edges.py | 7 +- .../diffusion/sample_edges_utils.py | 7 +- sparse_diffusion/diffusion_model_sparse.py | 16 ++-- sparse_diffusion/main.py | 3 +- sparse_diffusion/metrics/abstract_metrics.py | 7 +- sparse_diffusion/metrics/metrics_utils.py | 8 +- sparse_diffusion/metrics/molecular_metrics.py | 8 +- sparse_diffusion/metrics/sampling_metrics.py | 9 ++- sparse_diffusion/metrics/spectre_utils.py | 16 ++-- sparse_diffusion/metrics/train_metrics.py | 7 +- .../models/conv_transformer_model.py | 11 ++- sparse_diffusion/models/transconv_layer.py | 7 +- sparse_diffusion/models/transformer_model.py | 11 ++- 30 files changed, 247 insertions(+), 166 deletions(-) diff --git a/README.md b/README.md index 9f3828f..14f9293 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,29 @@ -# `Sparse denoising diffusion for large graph generation` - -Official code for the paper, "Sparse Training of Discrete Diffusion Models for Graph Generation," available [here](https://arxiv.org/abs/2311.02142). - -Checkpoints to reproduce the results can be found at [this link](https://drive.switch.ch/index.php/s/1hHNVCb0ylbYPoQ). Please refer to the updated version of our paper on arXiv. - - -## Environment installation -This code was tested with PyTorch 2.0.1, cuda 11.8 and torch_geometrics 2.3.1 - - - Download anaconda/miniconda if needed - - Create a rdkit environment that directly contains rdkit: - - ```conda create -c conda-forge -n sparse rdkit=2023.03.2 python=3.9``` - - `conda activate sparse` - - Check that this line does not return an error: - - ``` python3 -c 'from rdkit import Chem' ``` - - Install graph-tool (https://graph-tool.skewed.de/): - - ```conda install -c conda-forge graph-tool=2.45``` - - Check that this line does not return an error: - - ```python3 -c 'import graph_tool as gt' ``` - - Install the nvcc drivers for your cuda version. 
For example: - - ```conda install -c "nvidia/label/cuda-11.8.0" cuda``` - - Install a corresponding version of pytorch, for example: - - ```pip3 install torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118``` - - Install other packages using the requirement file: - - ```pip install -r requirements.txt``` - - Install mini-moses: - - ```pip install git+https://github.com/igor-krawczuk/mini-moses``` - - Run: - - ```pip install -e .``` - - - Navigate to the ./sparse_diffusion/analysis/orca directory and compile orca.cpp: - - ```g++ -O2 -std=c++11 -o orca orca.cpp``` - - -## Run the code - +# Sparse denoising diffusion for large graph generation +Forked from the official code for the paper, "Sparse Training of Discrete Diffusion Models for Graph Generation," available [here](https://arxiv.org/abs/2311.02142). +Checkpoints to reproduce the results can be found at [this link](https://drive.switch.ch/index.php/s/1hHNVCb0ylbYPoQ). +Please refer to the updated version of the paper [here](https://arxiv.org/abs/2311.02142). + +## Environment installation (Modified from README.md of [SparseDiff](https://github.com/vincenttsai2015/SparseDiff/blob/main/README.md)) +This code was tested with PyTorch 2.4.1, CUDA 12.1, and torch_geometric 2.4.0 +* Download anaconda/miniconda if needed +* Create the conda environment: ```conda create -c conda-forge -n digress rdkit=2023.03.2 python=3.9``` +* Activate the environment: ```conda activate digress``` +* Install graph-tool: ```conda install -c conda-forge graph-tool=2.45``` +* Verify the installation: + * ```python3 -c 'from rdkit import Chem'``` + * ```python3 -c 'import graph_tool as gt'``` +* Install the nvcc drivers: ```conda install -c "nvidia/label/cuda-12.1.0" cuda``` +* Install PyTorch: ```(python -m) pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121``` +* Install the PyG-related packages: ```(python -m) pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.4.0+cu121.html``` +* Install DGL (for SparseDiff): ```conda install -c dglteam/label/th24_cu121 dgl``` +* Please make sure the versions of the *nvcc driver, PyTorch, PyG, and DGL* are consistent with one another! +* Install the remaining packages: ```pip install -r requirements.txt``` +* Install mini-moses (optional): ```pip install git+https://github.com/igor-krawczuk/mini-moses``` +* Navigate to the directory ```./sparse_diffusion/analysis/orca``` and compile orca.cpp: ```g++ -O2 -std=c++11 -o orca orca.cpp``` + +## Main execution file usage +* Use the config files in the folder ```configs/experiment```. +* Example command for execution: ```CUDA_VISIBLE_DEVICES=0 python main.py +experiments=ego.yaml``` - All code is currently launched through `python3 main.py`. Check hydra documentation (https://hydra.cc/) for overriding default parameters. - To run the debugging code: `python3 main.py +experiment=debug.yaml`. We advise to try to run the debug mode first before launching full experiments. @@ -64,8 +43,5 @@ This code was tested with PyTorch 2.0.1, cuda 11.8 and torch_geometrics 2.3.1 } ``` - - ## Troubleshooting - `PermissionError: [Errno 13] Permission denied: 'SparseDiff/sparse_diffusion/analysis/orca/orca'`: You probably did not compile orca.
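The installation steps above ask that the nvcc toolkit, PyTorch, PyG, and DGL versions stay in sync. A minimal sanity check, assuming the packages import cleanly inside the `digress` environment; it only reports the installed versions and the CUDA build that torch was compiled against:

```python
# Report the installed versions so mismatched CUDA builds are easy to spot.
import torch
import torch_geometric
import dgl

print("torch:", torch.__version__, "| CUDA build:", torch.version.cuda)
print("torch_geometric:", torch_geometric.__version__)
print("dgl:", dgl.__version__)
print("GPU visible:", torch.cuda.is_available())
```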
diff --git a/configs/experiment/ego.yaml b/configs/experiment/ego.yaml index 0695fbd..77d610e 100644 --- a/configs/experiment/ego.yaml +++ b/configs/experiment/ego.yaml @@ -13,8 +13,15 @@ general: final_model_samples_to_generate: 151 final_model_samples_to_save: 30 final_model_chains_to_save: 10 +dataset: + name: 'ego' + datadir: 'data/ego/' + random_subset: null + pin_memory: False + molecules: False + spectre: False train: - n_epochs: 100000 + n_epochs: 100 batch_size: 32 save_model: True num_workers: 0 diff --git a/configs/experiment/nx_graphs.yaml b/configs/experiment/nx_graphs.yaml index e6d7ca8..18c2cfa 100644 --- a/configs/experiment/nx_graphs.yaml +++ b/configs/experiment/nx_graphs.yaml @@ -1,18 +1,18 @@ # @package _global_ general: - # General settings for ggg benchmarks - check_val_every_n_epochs: 5 - sample_every_val: 2 - samples_to_generate: 16 # since these are benchmarking graphs, we can use a smaller number - samples_to_save: 16 - chains_to_save: 1 - log_every_steps: 50 - number_chain_steps: 50 # Number of frames in each gif + # General settings for ggg benchmarks + check_val_every_n_epochs: 5 + sample_every_val: 2 + samples_to_generate: 16 # since these are benchmarking graphs, we can use a smaller number + samples_to_save: 16 + chains_to_save: 1 + log_every_steps: 50 + number_chain_steps: 50 # Number of frames in each gif - final_model_samples_to_generate: 10000 - final_model_samples_to_save: 100 - final_model_chains_to_save: 50 - cpus_per_gpu: 4 - force_ray: false - val_bs_multiplier: 1.0 \ No newline at end of file + final_model_samples_to_generate: 100 + final_model_samples_to_save: 100 + final_model_chains_to_save: 50 + cpus_per_gpu: 4 + force_ray: false + val_bs_multiplier: 1.0 \ No newline at end of file diff --git a/configs/experiment/planar.yaml b/configs/experiment/planar.yaml index 0544b4e..9c5447a 100644 --- a/configs/experiment/planar.yaml +++ b/configs/experiment/planar.yaml @@ -12,8 +12,14 @@ general: final_model_samples_to_generate: 40 final_model_samples_to_save: 30 final_model_chains_to_save: 20 +dataset: + name: 'planar' + datadir: 'data/planar/' + remove_h: null + molecules: False + spectre: True train: - n_epochs: 300000 + n_epochs: 30 batch_size: 64 save_model: True model: diff --git a/configs/experiment/sbm.yaml b/configs/experiment/sbm.yaml index 0493f2f..88de06a 100644 --- a/configs/experiment/sbm.yaml +++ b/configs/experiment/sbm.yaml @@ -13,8 +13,15 @@ general: final_model_samples_to_generate: 40 final_model_samples_to_save: 30 final_model_chains_to_save: 20 +data: + name: 'sbm' + datadir: 'data/sbm' + remove_h: null + molecules: False + spectre: True + pin_memory: False train: - n_epochs: 200000 + n_epochs: 200 batch_size: 32 save_model: True num_workers: 0 diff --git a/requirements.txt b/requirements.txt index 356e059..d5ef7a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ -dgl -hydra-core==1.3.2 -imageio==2.31.1 -matplotlib==3.7.1 -networkx==2.8.7 -numpy==1.23 -omegaconf==2.3.0 -overrides==7.3.1 -pandas==1.4 -pyemd==1.0.0 -PyGSP==0.5.1 -scipy==1.11.0 -pytorch_lightning==2.0.4 -setuptools==68.0.0 -torch_geometric==2.3.1 -torchmetrics==0.11.4 -tqdm==4.65.0 -wandb==0.15.4 \ No newline at end of file +hydra-core +imageio +matplotlib +networkx +numpy +omegaconf +overrides +pandas +pyemd +PyGSP +pytorch_lightning +scipy +setuptools +torchmetrics +tqdm +wandb +networkx-temporal +torch-geometric==2.4.0 \ No newline at end of file diff --git a/sparse_diffusion/analysis/visualization.py b/sparse_diffusion/analysis/visualization.py 
index d0ea0b9..21e6acf 100644 --- a/sparse_diffusion/analysis/visualization.py +++ b/sparse_diffusion/analysis/visualization.py @@ -1,4 +1,8 @@ -import os +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') from rdkit import Chem from rdkit.Chem import Draw, AllChem @@ -10,7 +14,7 @@ import rdkit.Chem import wandb import matplotlib.pyplot as plt -from sparse_diffusion.metrics.molecular_metrics import Molecule, SparseMolecule +from metrics.molecular_metrics import Molecule, SparseMolecule class Visualizer: diff --git a/sparse_diffusion/datasets/abstract_dataset.py b/sparse_diffusion/datasets/abstract_dataset.py index 526a407..1d5b904 100644 --- a/sparse_diffusion/datasets/abstract_dataset.py +++ b/sparse_diffusion/datasets/abstract_dataset.py @@ -1,8 +1,12 @@ import abc import numpy as np - -from sparse_diffusion.diffusion.distributions import DistributionNodes -import sparse_diffusion.utils as utils +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +from diffusion.distributions import DistributionNodes +import utils import torch import torch.nn.functional as F from torch_geometric.data.lightning import LightningDataset diff --git a/sparse_diffusion/datasets/guacamol_dataset.py b/sparse_diffusion/datasets/guacamol_dataset.py index 52dae4e..4f0d7f5 100644 --- a/sparse_diffusion/datasets/guacamol_dataset.py +++ b/sparse_diffusion/datasets/guacamol_dataset.py @@ -1,6 +1,8 @@ -import os -import os.path as osp +import os, sys import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import hashlib import numpy as np @@ -12,19 +14,19 @@ import torch.nn.functional as F from torch_geometric.data import InMemoryDataset, download_url -from sparse_diffusion.utils import PlaceHolder -from sparse_diffusion.datasets.abstract_dataset import ( +from utils import PlaceHolder +from datasets.abstract_dataset import ( MolecularDataModule, AbstractDatasetInfos, ) -from sparse_diffusion.datasets.dataset_utils import ( +from datasets.dataset_utils import ( save_pickle, mol_to_torch_geometric, load_pickle, Statistics, ) -from sparse_diffusion.metrics.molecular_metrics import SparseMolecule -from sparse_diffusion.metrics.metrics_utils import compute_all_statistics +from metrics.molecular_metrics import SparseMolecule +from metrics.metrics_utils import compute_all_statistics TRAIN_HASH = "05ad85d871958a05c02ab51a4fde8530" diff --git a/sparse_diffusion/datasets/moses_dataset.py b/sparse_diffusion/datasets/moses_dataset.py index a64012d..6d4ec74 100644 --- a/sparse_diffusion/datasets/moses_dataset.py +++ b/sparse_diffusion/datasets/moses_dataset.py @@ -1,8 +1,8 @@ -import os -import os.path as osp +import os, sys import pathlib - - +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import torch import torch.nn.functional as F from rdkit import Chem, RDLogger @@ -12,19 +12,19 @@ from torch_geometric.data import InMemoryDataset, download_url from hydra.utils import get_original_cwd -from sparse_diffusion.utils import PlaceHolder -from sparse_diffusion.datasets.abstract_dataset import ( +from utils import PlaceHolder +from datasets.abstract_dataset import ( MolecularDataModule, AbstractDatasetInfos, ) -from sparse_diffusion.datasets.dataset_utils import ( +from datasets.dataset_utils import ( 
save_pickle, mol_to_torch_geometric, load_pickle, Statistics, ) -from sparse_diffusion.metrics.molecular_metrics import SparseMolecule -from sparse_diffusion.metrics.metrics_utils import compute_all_statistics +from metrics.molecular_metrics import SparseMolecule +from metrics.metrics_utils import compute_all_statistics atom_encoder = {"C": 0, "N": 1, "S": 2, "O": 3, "F": 4, "Cl": 5, "Br": 6} diff --git a/sparse_diffusion/datasets/protein_dataset.py b/sparse_diffusion/datasets/protein_dataset.py index 9667a65..e53711d 100644 --- a/sparse_diffusion/datasets/protein_dataset.py +++ b/sparse_diffusion/datasets/protein_dataset.py @@ -1,6 +1,8 @@ -import os +import os, sys import pathlib import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import numpy as np import torch.nn.functional as F @@ -11,26 +13,25 @@ from torch_geometric.data import InMemoryDataset, download_url from hydra.utils import get_original_cwd -from sparse_diffusion.utils import PlaceHolder -from sparse_diffusion.datasets.abstract_dataset import ( +from utils import PlaceHolder +from datasets.abstract_dataset import ( AbstractDataModule, AbstractDatasetInfos, ) -from sparse_diffusion.datasets.dataset_utils import ( +from datasets.dataset_utils import ( load_pickle, save_pickle, Statistics, to_list, RemoveYTransform, ) -from sparse_diffusion.metrics.metrics_utils import ( +from metrics.metrics_utils import ( node_counts, atom_type_counts, edge_counts, graph_counts, ) - class ProteinDataset(InMemoryDataset): ''' Implementation based on https://github.com/KarolisMart/SPECTRE/blob/main/data.py diff --git a/sparse_diffusion/datasets/qm9_dataset.py b/sparse_diffusion/datasets/qm9_dataset.py index c4ab24f..c580bfe 100644 --- a/sparse_diffusion/datasets/qm9_dataset.py +++ b/sparse_diffusion/datasets/qm9_dataset.py @@ -1,7 +1,8 @@ -import os -import os.path as osp +import os, sys import pathlib - +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import torch import torch.nn.functional as F @@ -12,12 +13,12 @@ from torch_geometric.data import InMemoryDataset, download_url, extract_zip from hydra.utils import get_original_cwd -from sparse_diffusion.utils import PlaceHolder -from sparse_diffusion.datasets.abstract_dataset import ( +from utils import PlaceHolder +from datasets.abstract_dataset import ( MolecularDataModule, AbstractDatasetInfos, ) -from sparse_diffusion.datasets.dataset_utils import ( +from datasets.dataset_utils import ( load_pickle, save_pickle, mol_to_torch_geometric, @@ -26,7 +27,7 @@ to_list, files_exist, ) -from sparse_diffusion.metrics.metrics_utils import compute_all_statistics +from metrics.metrics_utils import compute_all_statistics class RemoveYTransform: diff --git a/sparse_diffusion/datasets/spectre_dataset_pyg.py b/sparse_diffusion/datasets/spectre_dataset_pyg.py index b71e0a4..1f11107 100644 --- a/sparse_diffusion/datasets/spectre_dataset_pyg.py +++ b/sparse_diffusion/datasets/spectre_dataset_pyg.py @@ -1,6 +1,8 @@ -import os +import os, sys import pathlib import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import numpy as np from tqdm import tqdm @@ -12,19 +14,19 @@ from hydra.utils import get_original_cwd from networkx import to_numpy_array -from sparse_diffusion.utils import PlaceHolder -from sparse_diffusion.datasets.abstract_dataset import ( +from utils import PlaceHolder +from datasets.abstract_dataset import ( 
AbstractDataModule, AbstractDatasetInfos, ) -from sparse_diffusion.datasets.dataset_utils import ( +from datasets.dataset_utils import ( load_pickle, save_pickle, Statistics, to_list, RemoveYTransform, ) -from sparse_diffusion.metrics.metrics_utils import ( +from metrics.metrics_utils import ( node_counts, atom_type_counts, edge_counts, @@ -172,7 +174,7 @@ def download(self): random_order = torch.randperm(adj.shape[-1]) adj = adj[random_order, :] adj = adj[:, random_order] - net = nx.from_numpy_matrix(adj.numpy()).to_undirected() + net = nx.from_numpy_array(adj.numpy()).to_undirected() if i in train_indices: train_data.append(adj) diff --git a/sparse_diffusion/diffusion/diffusion_utils.py b/sparse_diffusion/diffusion/diffusion_utils.py index e329b27..4b6cc19 100644 --- a/sparse_diffusion/diffusion/diffusion_utils.py +++ b/sparse_diffusion/diffusion/diffusion_utils.py @@ -2,10 +2,14 @@ from torch.nn import functional as F import numpy as np import math - -from sparse_diffusion.utils import PlaceHolder -from sparse_diffusion import utils -from sparse_diffusion.diffusion.sample_edges import sample_query_edges +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +import utils +from utils import PlaceHolder +from diffusion.sample_edges import sample_query_edges def sum_except_batch(x): diff --git a/sparse_diffusion/diffusion/extra_features.py b/sparse_diffusion/diffusion/extra_features.py index e26da07..c77710a 100644 --- a/sparse_diffusion/diffusion/extra_features.py +++ b/sparse_diffusion/diffusion/extra_features.py @@ -1,9 +1,13 @@ import time import math - +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import torch import torch.nn.functional as F -from sparse_diffusion import utils +import utils def batch_trace(X): diff --git a/sparse_diffusion/diffusion/extra_features_molecular.py b/sparse_diffusion/diffusion/extra_features_molecular.py index 86d9ed7..4717636 100644 --- a/sparse_diffusion/diffusion/extra_features_molecular.py +++ b/sparse_diffusion/diffusion/extra_features_molecular.py @@ -1,5 +1,10 @@ +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import torch -from sparse_diffusion import utils +import utils import torch_geometric.nn.pool as pool diff --git a/sparse_diffusion/diffusion/noise_schedule.py b/sparse_diffusion/diffusion/noise_schedule.py index 45aac26..2059dd0 100644 --- a/sparse_diffusion/diffusion/noise_schedule.py +++ b/sparse_diffusion/diffusion/noise_schedule.py @@ -1,7 +1,12 @@ import numpy as np import torch -from sparse_diffusion import utils -from sparse_diffusion.diffusion import diffusion_utils +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +import utils +from diffusion import diffusion_utils class PredefinedNoiseSchedule(torch.nn.Module): diff --git a/sparse_diffusion/diffusion/sample_edges.py b/sparse_diffusion/diffusion/sample_edges.py index ea1f81c..7a59191 100644 --- a/sparse_diffusion/diffusion/sample_edges.py +++ b/sparse_diffusion/diffusion/sample_edges.py @@ -8,8 +8,13 @@ import torch_geometric.nn.pool as pool # My files +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') 
import utils -from sparse_diffusion.diffusion.sample_edges_utils import ( +from diffusion.sample_edges_utils import ( matrix_to_condensed_index, condensed_to_matrix_index_batch, condensed_to_matrix_index, diff --git a/sparse_diffusion/diffusion/sample_edges_utils.py b/sparse_diffusion/diffusion/sample_edges_utils.py index be3dfeb..547e1c8 100644 --- a/sparse_diffusion/diffusion/sample_edges_utils.py +++ b/sparse_diffusion/diffusion/sample_edges_utils.py @@ -1,10 +1,13 @@ import torch import torch.nn.functional as F from torch_geometric.utils import coalesce - +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import utils - def condensed_to_matrix_index(condensed_index, num_nodes): """From https://stackoverflow.com/questions/5323818/condensed-matrix-function-to-find-pairs. condensed_index: (E) diff --git a/sparse_diffusion/diffusion_model_sparse.py b/sparse_diffusion/diffusion_model_sparse.py index 49ee647..57a1885 100644 --- a/sparse_diffusion/diffusion_model_sparse.py +++ b/sparse_diffusion/diffusion_model_sparse.py @@ -1,5 +1,9 @@ import time -import os +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import math import pickle import json @@ -21,21 +25,21 @@ from metrics.train_metrics import TrainLossDiscrete from metrics.abstract_metrics import SumExceptBatchMetric, SumExceptBatchKL, NLL from analysis.visualization import Visualizer -from sparse_diffusion import utils -from sparse_diffusion.diffusion import diffusion_utils -from sparse_diffusion.diffusion.sample_edges_utils import ( +import utils +from diffusion import diffusion_utils +from diffusion.sample_edges_utils import ( get_computational_graph, mask_query_graph_from_comp_graph, sample_non_existing_edge_attr, condensed_to_matrix_index_batch, matrix_to_condensed_index_batch, ) -from sparse_diffusion.diffusion.sample_edges import ( +from diffusion.sample_edges import ( sample_query_edges, sample_non_existing_edges_batched, sampled_condensed_indices_uniformly, ) -from sparse_diffusion.models.sign_pos_encoder import SignNetNodeEncoder +from models.sign_pos_encoder import SignNetNodeEncoder class DiscreteDenoisingDiffusion(pl.LightningModule): diff --git a/sparse_diffusion/main.py b/sparse_diffusion/main.py index ac0f3a9..9ae97fa 100644 --- a/sparse_diffusion/main.py +++ b/sparse_diffusion/main.py @@ -1,5 +1,4 @@ import os - os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" import pathlib @@ -20,7 +19,7 @@ from metrics.molecular_metrics import TrainMolecularMetricsDiscrete from diffusion.extra_features import DummyExtraFeatures, ExtraFeatures from diffusion.extra_features_molecular import ExtraMolecularFeatures -from sparse_diffusion.metrics.sampling_metrics import SamplingMetrics +from metrics.sampling_metrics import SamplingMetrics # debug for multi-gpu import resource diff --git a/sparse_diffusion/metrics/abstract_metrics.py b/sparse_diffusion/metrics/abstract_metrics.py index 45672e4..196f5c3 100644 --- a/sparse_diffusion/metrics/abstract_metrics.py +++ b/sparse_diffusion/metrics/abstract_metrics.py @@ -2,7 +2,12 @@ from torch import Tensor from torch.nn import functional as F from torchmetrics import Metric, MeanSquaredError -from sparse_diffusion.utils import PlaceHolder +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +from utils import 
PlaceHolder class TrainAbstractMetricsDiscrete(torch.nn.Module): diff --git a/sparse_diffusion/metrics/metrics_utils.py b/sparse_diffusion/metrics/metrics_utils.py index d1bf05b..4794ccb 100644 --- a/sparse_diffusion/metrics/metrics_utils.py +++ b/sparse_diffusion/metrics/metrics_utils.py @@ -6,7 +6,13 @@ from tqdm import tqdm from torch_geometric.data import Data import torch.nn.functional as F -from sparse_diffusion.datasets.dataset_utils import Statistics + +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +from datasets.dataset_utils import Statistics def molecules_to_datalist(molecules): diff --git a/sparse_diffusion/metrics/molecular_metrics.py b/sparse_diffusion/metrics/molecular_metrics.py index 974b044..cf7eaa4 100644 --- a/sparse_diffusion/metrics/molecular_metrics.py +++ b/sparse_diffusion/metrics/molecular_metrics.py @@ -1,4 +1,8 @@ -import os +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') from collections import Counter import pandas as pd @@ -27,7 +31,7 @@ from fcd_torch import FCD import utils -from sparse_diffusion.metrics.metrics_utils import ( +from metrics.metrics_utils import ( counter_to_tensor, wasserstein1d, total_variation1d, diff --git a/sparse_diffusion/metrics/sampling_metrics.py b/sparse_diffusion/metrics/sampling_metrics.py index 0e4963d..78eeac1 100644 --- a/sparse_diffusion/metrics/sampling_metrics.py +++ b/sparse_diffusion/metrics/sampling_metrics.py @@ -1,3 +1,8 @@ +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') from collections import Counter import torch.nn as nn @@ -10,8 +15,8 @@ from torch_geometric.utils import to_scipy_sparse_matrix import torch_geometric as pyg -import sparse_diffusion.utils as utils -from sparse_diffusion.metrics.metrics_utils import ( +import utils +from metrics.metrics_utils import ( counter_to_tensor, wasserstein1d, total_variation1d, diff --git a/sparse_diffusion/metrics/spectre_utils.py b/sparse_diffusion/metrics/spectre_utils.py index 9dbc357..a19843a 100644 --- a/sparse_diffusion/metrics/spectre_utils.py +++ b/sparse_diffusion/metrics/spectre_utils.py @@ -6,7 +6,11 @@ ##Navigate to the ./util/orca directory and compile orca.cpp # g++ -O2 -std=c++11 -o orca orca.cpp -import os +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import copy import random @@ -30,8 +34,8 @@ from string import ascii_uppercase, digits from torch_geometric.utils import to_dense_adj, is_undirected, to_networkx, remove_self_loops -from sparse_diffusion.utils import SparsePlaceHolder -from sparse_diffusion.analysis.dist_helper import ( +from utils import SparsePlaceHolder +from analysis.dist_helper import ( compute_mmd, gaussian_emd, gaussian, @@ -39,15 +43,13 @@ gaussian_tv, disc, ) -from sparse_diffusion.metrics.neural_metrics import ( +from metrics.neural_metrics import ( FIDEvaluation, MMDEvaluation, load_feature_extractor ) - - -from sparse_diffusion.utils import SparsePlaceHolder +from utils import SparsePlaceHolder PRINT_TIME = False __all__ = [ diff --git a/sparse_diffusion/metrics/train_metrics.py b/sparse_diffusion/metrics/train_metrics.py index f4d129f..37f4c67 100644 --- a/sparse_diffusion/metrics/train_metrics.py +++ 
b/sparse_diffusion/metrics/train_metrics.py @@ -1,6 +1,11 @@ import torch.nn as nn import wandb -from sparse_diffusion.metrics.abstract_metrics import CrossEntropyMetric +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +from metrics.abstract_metrics import CrossEntropyMetric class TrainLossDiscrete(nn.Module): diff --git a/sparse_diffusion/models/conv_transformer_model.py b/sparse_diffusion/models/conv_transformer_model.py index 34a8191..baced89 100644 --- a/sparse_diffusion/models/conv_transformer_model.py +++ b/sparse_diffusion/models/conv_transformer_model.py @@ -1,3 +1,8 @@ +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import math from typing import Optional, Tuple, Union @@ -14,9 +19,9 @@ import torch_geometric.nn.pool as pool from torch_geometric.utils import softmax, sort_edge_index -from sparse_diffusion import utils -from sparse_diffusion.models.transconv_layer import TransformerConv -from sparse_diffusion.models.layers import SparseXtoy, SparseEtoy +import utils +from models.transconv_layer import TransformerConv +from models.layers import SparseXtoy, SparseEtoy class XEyTransformerLayer(nn.Module): diff --git a/sparse_diffusion/models/transconv_layer.py b/sparse_diffusion/models/transconv_layer.py index f46e1b1..4c6ec5f 100644 --- a/sparse_diffusion/models/transconv_layer.py +++ b/sparse_diffusion/models/transconv_layer.py @@ -1,3 +1,8 @@ +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import math from typing import Optional, Tuple, Union @@ -11,7 +16,7 @@ from torch_geometric.nn.dense.linear import Linear from torch_geometric.typing import Adj, OptTensor, Size from torch_geometric.utils import softmax -from sparse_diffusion.models.layers import SparseXtoy, SparseEtoy +from models.layers import SparseXtoy, SparseEtoy compress_alpha = True compress_edge_attr = True diff --git a/sparse_diffusion/models/transformer_model.py b/sparse_diffusion/models/transformer_model.py index 8d4eef9..4e97081 100644 --- a/sparse_diffusion/models/transformer_model.py +++ b/sparse_diffusion/models/transformer_model.py @@ -1,3 +1,8 @@ +import os, sys +import pathlib +import os.path as osp +RootPath = pathlib.Path(osp.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') import math import torch @@ -9,9 +14,9 @@ from torch.nn.modules.dropout import Dropout from torch.nn.modules.normalization import LayerNorm -from sparse_diffusion import utils -from sparse_diffusion.diffusion import diffusion_utils -from sparse_diffusion.models.layers import Xtoy, Etoy, masked_softmax +import utils +from diffusion import diffusion_utils +from models.layers import Xtoy, Etoy, masked_softmax class XEyTransformerLayer(nn.Module): From 05b18962a3b3e085cd906a1de1146872904beb51 Mon Sep 17 00:00:00 2001 From: Aka2210 Date: Tue, 18 Nov 2025 16:56:24 +0800 Subject: [PATCH 2/8] fix: Make torch.load call compatible across versions without weights_only --- sparse_diffusion/datasets/qm9_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse_diffusion/datasets/qm9_dataset.py b/sparse_diffusion/datasets/qm9_dataset.py index c580bfe..e4fa0a7 100644 --- a/sparse_diffusion/datasets/qm9_dataset.py +++ b/sparse_diffusion/datasets/qm9_dataset.py @@ -1,3 +1,4 @@ +import inspect import os, sys import pathlib 
import os.path as osp @@ -84,7 +85,7 @@ def __init__( } super().__init__(root, transform, pre_transform, pre_filter) - self.data, self.slices = torch.load(self.processed_paths[0]) + self.data, self.slices = torch.load(self.processed_paths[0], **({"weights_only": False} if "weights_only" in inspect.signature(torch.load).parameters else {})) self.statistics = Statistics( num_nodes=load_pickle(self.processed_paths[1]), From a8ebbb2a9ba5d015c74569122503e9aa2e8f8374 Mon Sep 17 00:00:00 2001 From: vincenttsai2015 Date: Wed, 19 Nov 2025 07:04:49 +0000 Subject: [PATCH 3/8] Update config file --- configs/dataset/sbm.yaml | 2 +- configs/experiment/sbm.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/dataset/sbm.yaml b/configs/dataset/sbm.yaml index ae132f5..4d87b1c 100644 --- a/configs/dataset/sbm.yaml +++ b/configs/dataset/sbm.yaml @@ -1,5 +1,5 @@ name: 'sbm' -datadir: 'data/sbm' +datadir: 'data/sbm/' remove_h: null molecules: False spectre: True diff --git a/configs/experiment/sbm.yaml b/configs/experiment/sbm.yaml index 88de06a..5c649a9 100644 --- a/configs/experiment/sbm.yaml +++ b/configs/experiment/sbm.yaml @@ -13,9 +13,9 @@ general: final_model_samples_to_generate: 40 final_model_samples_to_save: 30 final_model_chains_to_save: 20 -data: +dataset: name: 'sbm' - datadir: 'data/sbm' + datadir: 'data/sbm/' remove_h: null molecules: False spectre: True From da1fc76705e62ce9e80f763bcec132a5e8003666 Mon Sep 17 00:00:00 2001 From: vincenttsai2015 Date: Wed, 19 Nov 2025 15:05:52 +0800 Subject: [PATCH 4/8] Update config file --- configs/dataset/sbm.yaml | 2 +- configs/experiment/sbm.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/dataset/sbm.yaml b/configs/dataset/sbm.yaml index ae132f5..4d87b1c 100644 --- a/configs/dataset/sbm.yaml +++ b/configs/dataset/sbm.yaml @@ -1,5 +1,5 @@ name: 'sbm' -datadir: 'data/sbm' +datadir: 'data/sbm/' remove_h: null molecules: False spectre: True diff --git a/configs/experiment/sbm.yaml b/configs/experiment/sbm.yaml index 88de06a..789c65a 100644 --- a/configs/experiment/sbm.yaml +++ b/configs/experiment/sbm.yaml @@ -13,15 +13,15 @@ general: final_model_samples_to_generate: 40 final_model_samples_to_save: 30 final_model_chains_to_save: 20 -data: +dataset: name: 'sbm' - datadir: 'data/sbm' + datadir: 'data/sbm/' remove_h: null molecules: False spectre: True pin_memory: False train: - n_epochs: 200 + n_epochs: 20 batch_size: 32 save_model: True num_workers: 0 From 678e2ae930ea9151c36a2052723b33fa92b76ab2 Mon Sep 17 00:00:00 2001 From: vincenttsai2015 Date: Wed, 19 Nov 2025 15:20:29 +0800 Subject: [PATCH 5/8] Fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 14f9293..7baf8de 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ This code was tested with PyTorch 2.4.1, cuda 12.1 and torch_geometrics 2.4.0 ## Main execution file usage * Use config files in folder ```config/experiments```. -* Example command for execution: ```CUDA_VISIBLE_DEVICES=0 python main.py +experiments=ego.yaml``` +* Example command for execution: ```CUDA_VISIBLE_DEVICES=0 python main.py +experiment=ego.yaml``` - All code is currently launched through `python3 main.py`. Check hydra documentation (https://hydra.cc/) for overriding default parameters. - To run the debugging code: `python3 main.py +experiment=debug.yaml`. We advise to try to run the debug mode first before launching full experiments. 
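The `torch.load` change in PATCH 2 guards the `weights_only` keyword so the same dataset code runs on both older and newer torch releases. The pattern can be kept in one place; a sketch of such a helper (the name `torch_load_compat` is illustrative and not part of the patch):

```python
import inspect
import torch


def torch_load_compat(path):
    """Call torch.load with weights_only=False only when the installed
    torch version actually accepts that keyword."""
    kwargs = {}
    if "weights_only" in inspect.signature(torch.load).parameters:
        kwargs["weights_only"] = False
    return torch.load(path, **kwargs)


# Usage mirroring the patched dataset code:
# self.data, self.slices = torch_load_compat(self.processed_paths[0])
```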
From a037daffbd0a49443e280eae451b8a71051fced4 Mon Sep 17 00:00:00 2001 From: vincenttsai2015 Date: Wed, 19 Nov 2025 07:22:04 +0000 Subject: [PATCH 6/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7baf8de..f5d1a48 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ This code was tested with PyTorch 2.4.1, cuda 12.1 and torch_geometrics 2.4.0 * Please ensure the synchronization of the versions of *nvcc drivers, Pytorch, PyG, and DGL*! * Install the rest packages: ```pip install -r requirements.txt``` * Install mini-moses (optional): ```pip install git+https://github.com/igor-krawczuk/mini-moses``` -* Navigate to the directory ```./sparse_diffusion/analysis/orca``` and compile orca.cpp: ```g++ -O2 -std=c++11 -o orca orca.cpp``` +* Navigate to the directory ```./sparse_diffusion/analysis/orca``` and compile orca.cpp: ```g++ -O2 -std=c++11 -o orca orca.cpp``` ## Main execution file usage * Use config files in folder ```config/experiments```. From 77992a1473d3c898a9fcb00c13c6bbe865ad88d1 Mon Sep 17 00:00:00 2001 From: Aka2210 Date: Wed, 19 Nov 2025 23:25:54 +0800 Subject: [PATCH 7/8] ENH: Add pyg_real_dataset to enable wiki-vote and MulDyDiff graphs in SparseDiff (non-temporal, no layer support yet) --- configs/experiment/wiki-vote.yaml | 41 +++ sparse_diffusion/datasets/abstract_dataset.py | 2 +- sparse_diffusion/datasets/pyg_real_dataset.py | 301 ++++++++++++++++++ .../datasets/spectre_dataset_pyg.py | 3 +- sparse_diffusion/main.py | 7 + sparse_diffusion/metrics/metrics_utils.py | 3 +- 6 files changed, 354 insertions(+), 3 deletions(-) create mode 100644 configs/experiment/wiki-vote.yaml create mode 100644 sparse_diffusion/datasets/pyg_real_dataset.py diff --git a/configs/experiment/wiki-vote.yaml b/configs/experiment/wiki-vote.yaml new file mode 100644 index 0000000..28bf74b --- /dev/null +++ b/configs/experiment/wiki-vote.yaml @@ -0,0 +1,41 @@ +# @package _global_ +general: + name : 'wiki-vote' + num_bins: 27500 + gpus : 1 + wandb: 'online' + resume: null # If resume, path to ckpt file from outputs directory in main directory + test_only: null + check_val_every_n_epochs: 1000 + sample_every_val: 4 + samples_to_generate: 64 + samples_to_save: 9 + chains_to_save: 1 + final_model_samples_to_generate: 151 + final_model_samples_to_save: 30 + final_model_chains_to_save: 10 +dataset: + name: 'wiki-vote' + datadir: 'data/wiki-vote/' + random_subset: null + pin_memory: False + molecules: False + spectre: False +train: + n_epochs: 100 + batch_size: 32 + save_model: True + num_workers: 0 +model: + diffusion_steps: 1000 + n_layers: 8 + num_degree: 20 + lambda_train: [5, 0, 2] + extra_features: 'all' + edge_fraction: 0.1 + # Do not set hidden_mlp_E, dim_ffE too high, computing large tensors on the edges is costly + # At the moment (03/08), y contains quite little information + hidden_mlp_dims: { 'X': 128, 'E': 64, 'y': 128 } + # The dimensions should satisfy dx % n_head == 0 + hidden_dims: { 'dx': 256, 'de': 64, 'dy': 128, 'n_head': 8, 'dim_ffX': 256, 'dim_ffE': 64, 'dim_ffy': 256 } + pin_memory: False diff --git a/sparse_diffusion/datasets/abstract_dataset.py b/sparse_diffusion/datasets/abstract_dataset.py index 1d5b904..8e58730 100644 --- a/sparse_diffusion/datasets/abstract_dataset.py +++ b/sparse_diffusion/datasets/abstract_dataset.py @@ -30,7 +30,6 @@ def __init__(self, cfg, datasets): def dataset_stat(self): dataset = self.train_dataset + self.val_dataset + self.test_dataset - nodes = [] edges = [] sparsity = [] 
@@ -180,6 +179,7 @@ def complete_infos(self, statistics, node_types): def compute_input_dims(self, datamodule, extra_features, domain_features): data = next(iter(datamodule.train_dataloader())) + print(data) example_batch = self.to_one_hot(data) ex_dense, node_mask = utils.to_dense( example_batch.x, diff --git a/sparse_diffusion/datasets/pyg_real_dataset.py b/sparse_diffusion/datasets/pyg_real_dataset.py new file mode 100644 index 0000000..0a3d436 --- /dev/null +++ b/sparse_diffusion/datasets/pyg_real_dataset.py @@ -0,0 +1,301 @@ +import inspect +import os, sys, pathlib +import os.path as osp + +from sklearn.model_selection import train_test_split + +from sparse_diffusion.metrics.metrics_utils import atom_type_counts, edge_counts, node_counts +from sparse_diffusion.utils import PlaceHolder +RootPath = pathlib.Path(os.path.realpath(__file__)).parents[1] +sys.path.append(f'{RootPath}') +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import networkx as nx +import networkx_temporal as tx +from itertools import combinations + +import torch +from torch_geometric.utils import from_networkx +from torch_geometric.data import InMemoryDataset +from hydra.utils import get_original_cwd +from datasets.dataset_utils import ( + RemoveYTransform, + load_pickle, + Statistics, + save_pickle, + to_list, +) + +from datasets.abstract_dataset import AbstractDataModule, AbstractDatasetInfos + +def attribute_label(interaction_dict, snapshot_list): + labeled_snapshots = [] + interaction_id = {b:i for i,b in enumerate(interaction_dict.keys())} + node_labels = torch.eye(2) + edge_labels = torch.eye(3) + cross_edge_labels = torch.eye(3) + for _, g in enumerate(snapshot_list): + # intra-layer + layers = {l: nx.Graph() for l in range(len(interaction_dict))} + for l in range(len(interaction_dict)): + for u in g.nodes(): + layers[l].add_node((u, l), x=torch.tensor(0, dtype=torch.long), active=0, nid=u) + for u, v in g.edges(): + layers[l].add_edge((u, l), (v, l), edge_attr=torch.tensor(1, dtype=torch.long)) + for u, v, d in g.edges(data=True): + interaction = d['interaction'] + l = interaction_id[interaction] + layers[l].add_edge((u, l), (v, l), edge_attr=torch.tensor(2, dtype=torch.long)) + layers[l].nodes[(u, l)]['x'] = torch.tensor(1, dtype=torch.long) + layers[l].nodes[(u, l)]['active'] = 1 + layers[l].nodes[(v, l)]['x'] = torch.tensor(1, dtype=torch.long) + layers[l].nodes[(v, l)]['active'] = 1 + # inter-layer + cross_layer_links = {(interaction_id[i1], interaction_id[i2]): nx.Graph() for i1, i2 in combinations(interaction_id.keys(),2)} + for u in g.nodes(): + for i1, i2 in combinations(interaction_id, 2): + l1, l2 = interaction_id[i1], interaction_id[i2] + cross_layer_links[(l1, l2)].add_node((u, l1), nid=u, layer=l1, x=torch.tensor(0, dtype=torch.long)) + cross_layer_links[(l1, l2)].add_node((u, l2), nid=u, layer=l2, x=torch.tensor(0, dtype=torch.long)) + cross_layer_links[(l1, l2)].add_edge((u, l1),(u, l2), edge_attr=torch.tensor(1, dtype=torch.long)) + if layers[l1].nodes[(u, l1)]['active'] == layers[l2].nodes[(u, l2)]['active']: + cross_layer_links[(l1, l2)].nodes[(u, l1)]['x'] = torch.tensor(1, dtype=torch.long) + cross_layer_links[(l1, l2)].nodes[(u, l2)]['x'] = torch.tensor(1, dtype=torch.long) + cross_layer_links[(l1, l2)].edges[(u, l1),(u, l2)]['edge_attr'] = torch.tensor(2, dtype=torch.long) + labeled_snapshots.append({'intra': layers, 'inter': cross_layer_links}) + return labeled_snapshots + +class RealDataset(InMemoryDataset): + def __init__(self, dataset_name, split, root, 
num_bins: int, seed=None, + transform=None, pre_transform=None, pre_filter=None): + self.dataset_name = dataset_name + + self.split = split + if self.split == "train": + self.file_idx = 0 + elif self.split == "val": + self.file_idx = 1 + else: + self.file_idx = 2 + + self.num_bins = num_bins + self.seed = seed + + super().__init__(root, transform, pre_transform, pre_filter) + self.data, self.slices = torch.load( + self.processed_paths[0], + **({"weights_only": False} if "weights_only" in inspect.signature(torch.load).parameters else {}) + ) + + self.statistics = Statistics( + num_nodes=load_pickle(self.processed_paths[1]), + node_types=torch.from_numpy(np.load(self.processed_paths[2])).float(), + bond_types=torch.from_numpy(np.load(self.processed_paths[3])).float(), + ) + + @property + def raw_file_names(self): + return ["train.csv", "val.csv", "test.csv", "actions.csv"] + + @property + def split_file_name(self): + return ["train.pt", "val.pt", "test.pt"] + + @property + def split_paths(self): + r"""The absolute filepaths that must be present in order to skip + splitting.""" + files = to_list(self.split_file_name) + return [osp.join(self.raw_dir, f) for f in files] + + @property + def processed_file_names(self): + if self.split == "train": + return [ + f"train.pt", + f"train_n.pickle", + f"train_node_types.npy", + f"train_bond_types.npy", + ] + elif self.split == "val": + return [ + f"val.pt", + f"val_n.pickle", + f"val_node_types.npy", + f"val_bond_types.npy", + ] + else: + return [ + f"test.pt", + f"test_n.pickle", + f"test_node_types.npy", + f"test_bond_types.npy", + ] + + def download(self): + df = pd.read_csv(self.raw_paths[-1]) + + train_df, temp_df = train_test_split(df, test_size=0.30, shuffle=True, random_state=42) + + val_df, test_df = train_test_split(temp_df, test_size=0.50, shuffle=True, random_state=42) + + # 儲存 + train_df.to_csv(self.raw_paths[0], index=False) + val_df.to_csv(self.raw_paths[1], index=False) + test_df.to_csv(self.raw_paths[2], index=False) + + def process(self): + print(f'Loading csv data = {self.split}, file_idx = {self.file_idx}...') + df = pd.read_csv(self.raw_paths[self.file_idx]) + df = df[['source', 'target', 'interaction', 'datetime']] + print('Building networkx graph') + G = nx.from_pandas_edgelist(df, 'source', 'target', edge_attr=['interaction','datetime']) + + interactions = df['interaction'].unique().tolist() + interaction_id = {name: i for i, name in enumerate(interactions)} + + print('Temporal sequence construction...') + TG = tx.from_static(G) + TG = TG.slice(bins=self.num_bins) + snapshot_list = TG.to_snapshots() + + # node_number_list = [len(G.nodes) for G in sequence_list] + # edge_number_list = [len(G.edges) for G in sequence_list] + # N_max = max(node_number_list) + # E_max = max(edge_number_list) + + print('Labeling snapshots...') + labeled = attribute_label(interaction_id, snapshot_list) + print(f'len(labeled_snapshots)={len(labeled)}') + + # flatten temporal graphs + print('Flatten the multi-layer snapshots...') + flatten_nx = [] + for s in labeled: + G_flat = nx.Graph() + + for l in s['intra']: + G_flat.add_nodes_from(s['intra'][l].nodes(data=True)) + G_flat.add_edges_from(s['intra'][l].edges(data=True)) + + for (l1, l2) in s['inter']: + for (u_node, v_node, attrs) in s['inter'][(l1, l2)].edges(data=True): + u, _layer1 = u_node + v, _layer2 = v_node + + if u == v: + continue + + G_flat.add_edge(u_node, v_node, **attrs) + + flatten_nx.append(G_flat) + + print("Flattened snapshots:", len(flatten_nx)) + + print('Relabeling snapshots...') 
+ relabeled = [] + for Gf in flatten_nx: + mapping = {n: i for i, n in enumerate(Gf.nodes())} + relabeled.append(nx.relabel_nodes(Gf, mapping)) + print(f'len(relabeled_nx_snapshots)={len(relabeled)}') + + print('Converting to PyG format...') + data_list = [from_networkx(snapshot) for _, snapshot in enumerate(relabeled)] + + for _, snapshot in enumerate(data_list): + snapshot.y = torch.zeros(1,2) + print(f'len(flatten_pyg_snapshots)={len(data_list)}') + + num_nodes = node_counts(data_list) + node_types = atom_type_counts(data_list, num_classes=2) + bond_types = edge_counts(data_list, num_bond_types=3) + torch.save(self.collate(data_list), self.processed_paths[0]) + save_pickle(num_nodes, self.processed_paths[1]) + np.save(self.processed_paths[2], node_types) + np.save(self.processed_paths[3], bond_types) + +class RealGraphDataModule(AbstractDataModule): + def __init__(self, cfg): + self.cfg = cfg + self.datadir = cfg.dataset.datadir + self.dataset_name = cfg.dataset.name + base_path = pathlib.Path(get_original_cwd()).parents[0] + root_path = os.path.join(base_path, self.datadir) + pre_transform = RemoveYTransform() + + datasets = { + "train": RealDataset( + dataset_name=self.cfg.dataset.name, + pre_transform=pre_transform, + split="train", + root=root_path, + num_bins=cfg.general.num_bins + ), + "val": RealDataset( + dataset_name=self.cfg.dataset.name, + pre_transform=pre_transform, + split="val", + root=root_path, + num_bins=cfg.general.num_bins + ), + "test": RealDataset( + dataset_name=self.cfg.dataset.name, + pre_transform=pre_transform, + split="test", + root=root_path, + num_bins=cfg.general.num_bins + ), + } + + self.statistics = { + "train": datasets["train"].statistics, + "val": datasets["val"].statistics, + "test": datasets["test"].statistics, + } + + super().__init__(cfg, datasets) + super().prepare_dataloader() + self.inner = self.train_dataset + + def save_datasets(self): + """ + Save train/val/test datasets + """ + output_dir = os.path.join(self.root_path, "processed") + os.makedirs(output_dir, exist_ok=True) + + splits = { + "train": self.train_dataset, + "val": self.val_dataset, + "test": self.test_dataset + } + + for split, dataset in splits.items(): + path = os.path.join(output_dir, f"{split}_dataset.pt") + data_list = list(dataset) # Ensure it's serializable + torch.save(data_list, path) + print(f"Saved {split} dataset with {len(data_list)} graphs to: {path}") + +class RealDatasetInfos(AbstractDatasetInfos): + def __init__(self, datamodule): + self.is_molecular = False + self.spectre = False + self.use_charge = False + self.dataset_name = datamodule.dataset_name + self.node_types = datamodule.inner.statistics.node_types + self.bond_types = datamodule.inner.statistics.bond_types + super().complete_infos( + datamodule.statistics, len(datamodule.inner.statistics.node_types) + ) + self.input_dims = PlaceHolder( + X=len(self.node_types), E=len(self.bond_types), y=0, charge=0 + ) + self.output_dims = PlaceHolder( + X=len(self.node_types), E=len(self.bond_types), y=0, charge=0 + ) + self.statistics = { + 'train': datamodule.statistics['train'], + 'val': datamodule.statistics['val'], + 'test': datamodule.statistics['test'] + } \ No newline at end of file diff --git a/sparse_diffusion/datasets/spectre_dataset_pyg.py b/sparse_diffusion/datasets/spectre_dataset_pyg.py index 1f11107..0ee2a2c 100644 --- a/sparse_diffusion/datasets/spectre_dataset_pyg.py +++ b/sparse_diffusion/datasets/spectre_dataset_pyg.py @@ -1,3 +1,4 @@ +import inspect import os, sys import pathlib import os.path 
as osp @@ -57,7 +58,7 @@ def __init__( self.file_idx = 2 super().__init__(root, transform, pre_transform, pre_filter) - self.data, self.slices = torch.load(self.processed_paths[0]) + self.data, self.slices = torch.load(self.processed_paths[0], **({"weights_only": False} if "weights_only" in inspect.signature(torch.load).parameters else {})) self.statistics = Statistics( num_nodes=load_pickle(self.processed_paths[1]), diff --git a/sparse_diffusion/main.py b/sparse_diffusion/main.py index 9ae97fa..71b3042 100644 --- a/sparse_diffusion/main.py +++ b/sparse_diffusion/main.py @@ -106,6 +106,13 @@ def main(cfg: DictConfig): domain_features = DummyExtraFeatures() train_metrics = TrainMolecularMetricsDiscrete(dataset_infos) + elif dataset_config["name"] in ["wiki-vote", "my_random_data"]: + from datasets.pyg_real_dataset import (RealGraphDataModule, RealDatasetInfos) + datamodule = RealGraphDataModule(cfg) + dataset_infos = RealDatasetInfos(datamodule) + train_metrics = TrainAbstractMetricsDiscrete() + domain_features = DummyExtraFeatures() + dataloaders = datamodule.dataloaders else: raise NotImplementedError("Unknown dataset {}".format(cfg["dataset"])) diff --git a/sparse_diffusion/metrics/metrics_utils.py b/sparse_diffusion/metrics/metrics_utils.py index 4794ccb..51c03c6 100644 --- a/sparse_diffusion/metrics/metrics_utils.py +++ b/sparse_diffusion/metrics/metrics_utils.py @@ -98,9 +98,10 @@ def edge_counts(data_list, num_bond_types=5): for data in tqdm(data_list): total_pairs = data.num_nodes * (data.num_nodes - 1) - num_edges = data.edge_attr.shape[0] num_non_edges = total_pairs - num_edges + if(num_non_edges < 0): + print(data, data.edge_index) assert num_non_edges >= 0 if len(data.edge_attr.shape) == 1: From 2a6343e3b58f0f69c1ccd2db13c50fc22b97437b Mon Sep 17 00:00:00 2001 From: Tang Webber Date: Thu, 20 Nov 2025 16:28:14 +0000 Subject: [PATCH 8/8] Fix: initialize empty y in sample_batch to resolve dimension mismatch error --- sparse_diffusion/diffusion_model_sparse.py | 46 +++++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/sparse_diffusion/diffusion_model_sparse.py b/sparse_diffusion/diffusion_model_sparse.py index 57a1885..cbeb064 100644 --- a/sparse_diffusion/diffusion_model_sparse.py +++ b/sparse_diffusion/diffusion_model_sparse.py @@ -1213,7 +1213,41 @@ def sample_batch( sparse_sampled_data = diffusion_utils.sample_sparse_discrete_feature_noise( limit_dist=self.limit_dist, node_mask=node_mask ) - + # ================= START FIX ================= + # 問題修復:Sampling 時 y 為空,導致 dimension mismatch。 + # 邏輯:模型總輸入 y 維度 = Base_y + Time(1) + Extra_y + if sparse_sampled_data.y.shape[1] == 0: + # 1. 計算 Extra Features 的 y 維度 (通過跑一次 dummy data) + dummy_data = { + 'node_t': sparse_sampled_data.node[:1], + 'edge_index_t': torch.zeros((2, 0), device=self.device, dtype=torch.long), + 'edge_attr_t': torch.zeros((0, self.out_dims.E), device=self.device), + 'batch': torch.zeros(1, device=self.device, dtype=torch.long), + 'y_t': torch.zeros((1, 0), device=self.device), + 'charge_t': torch.zeros((1, 0), device=self.device), + } + + with torch.no_grad(): + try: + extra_feats = self.extra_features(dummy_data) + if isinstance(extra_feats, tuple): + extra_feats = extra_feats[0] + extra_y_dim = extra_feats.y.shape[-1] if hasattr(extra_feats, 'y') else 0 + except Exception as e: + print(f"[Warning] Failed to verify extra features dim dynamically: {e}") + # 如果動態計算失敗,根據你的 log 手動設定 (Total 41 - Time 1 - Base 2 = 38) + extra_y_dim = 38 + + # 2. 
Compute the missing base y dimension + # self.in_dims.y is the total input y dimension the model was built with (41) + # subtract the time step (1) and the extra features (extra_y_dim) + base_y_dim = self.in_dims.y - 1 - extra_y_dim + + # 3. Initialize y with the correct dimension (all zeros) + if base_y_dim > 0: + sparse_sampled_data.y = torch.zeros((batch_size, base_y_dim), device=self.device) + print(f"[DEBUG Fix] Initialized empty y to shape: {sparse_sampled_data.y.shape} (Base: {base_y_dim}, Extra: {extra_y_dim}, Time: 1)") + # ================= END FIX ================= assert number_chain_steps < self.T chain = utils.SparseChainPlaceHolder(keep_chain=keep_chain) @@ -1385,6 +1419,7 @@ def sample_p_zs_given_zt(self, s_float, t_float, data): Samples from zs ~ p(zs | zt). Only used during sampling. if last_step, return the graph prediction as well """ + print(f"[DEBUG sample_p_zs_given_zt] Input data.y shape: {data.y.shape}") node = data.node edge_index = data.edge_index edge_attr = data.edge_attr @@ -1462,7 +1497,8 @@ def sample_p_zs_given_zt(self, s_float, t_float, data): "t_int": (t_float * self.T).int(), "t_float": t_float, } - + print(f"[DEBUG] sparse_noisy_data['y_t'] shape: {sparse_noisy_data['y_t'].shape}") + for i in range(len_loop): if self.autoregressive and i != 0: sparse_noisy_data["edge_index_t"] = new_edge_index @@ -1692,6 +1728,12 @@ def compute_extra_data(self, sparse_noisy_data): "charge_t": sparse_noisy_data["charge_t"], } + # print(f"[DEBUG compute_extra_data] Input y_t: {sparse_noisy_data['y_t'].shape}") + # print(f"[DEBUG compute_extra_data] t_float: {t_float.shape}") + # print(f"[DEBUG compute_extra_data] extra_y: {extra_y.shape}") + # y = torch.hstack((sparse_noisy_data["y_t"], t_float, extra_y)).float() + # print(f"[DEBUG compute_extra_data] Final y: {y.shape}") + return extra_sparse_noisy_data def get_scaling_layers(self):
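To summarize the dimension bookkeeping that PATCH 8 relies on: the y vector fed to the network is the concatenation of the base graph-level features, one timestep scalar, and the extra-feature block, so the base part that must be materialized when sampling starts from an empty y follows by subtraction. A small numeric sketch using the figures quoted in the patch comments (total 41, extra 38, time 1):

```python
# Figures taken from the debug note in PATCH 8 ("Total 41 - Time 1 - Base 2 = 38").
total_y_dim = 41   # in_dims.y: full y dimension the network expects
extra_y_dim = 38   # y features appended by the ExtraFeatures module
time_dim = 1       # scalar diffusion timestep concatenated to y

base_y_dim = total_y_dim - time_dim - extra_y_dim
print("base y dimension to initialize at sampling time:", base_y_dim)  # -> 2
```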