diff --git a/datasets/README.md b/datasets/README.md
index c662f8e2..516f4b49 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -1,7 +1,20 @@
 # Data Preparing
-1. Access to the synapse multi-organ dataset:
-    1. Sign up in the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and download the dataset. Convert them to numpy format, clip the images within [-125, 275], normalize each 3D image to [0, 1], and extract 2D slices from 3D volume for training cases while keeping the 3D volume in h5 format for testing cases.
+1. Get access to the Synapse multi-organ dataset by signing up on the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and downloading the Abdomen dataset.
+   This can also be done with the `download_data.py` script in the `datasets` directory:
+   ```
+   python download_data.py [Synapse username] [Synapse password] [SynapseId of the dataset] [directory where the file should be stored]
+   ```
+   Running the script with `--help` also displays the SynapseIds of the Abdomen dataset.
+   (You probably want to download either `Reg-Training-Testing` or `Reg-Training-Training`.)
+2. Preprocess the data:
+   1. Use the `preprocess_data.py` script in the `datasets` directory.
+      It clips the image data to the range [-125, 275], normalizes it to [0, 1], extracts 2D slices from the 3D volumes, and saves them in the appropriate file formats.
+      ```
+      python preprocess_data.py [location of the unzipped Abdomen dataset]
+      ```
+      By default, the data in the target directory won't be overwritten unless the `--overwrite` parameter is passed.
+      For an overview of additional arguments, use the `--help` option.
-2. You can also send an Email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction.
-2. The directory structure of the whole project is as follows:
+3. You can also send an Email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction.
+4. The directory structure of the whole project is as follows:
@@ -9,7 +22,9 @@
 .
 ├── TransUNet
 │   ├──datasets
-│   │    └── dataset_*.py
+│   │    ├── dataset_*.py
+│   │    ├── download_data.py
+│   │    └── preprocess_data.py
 │   ├──train.py
 │   ├──test.py
 │   └──...
diff --git a/datasets/download_data.py b/datasets/download_data.py
new file mode 100644
index 00000000..895bb32c
--- /dev/null
+++ b/datasets/download_data.py
@@ -0,0 +1,25 @@
+import argparse
+import synapseclient
+
+
+def main(args: argparse.Namespace):
+    syn = synapseclient.Synapse()
+    syn.login(args.username, args.password)
+
+    syn.get(entity=args.entity, downloadLocation=args.download_dir)
+    print("File downloaded successfully")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("username", type=str, help="Username for your Synapse account.")
+    parser.add_argument("password", type=str, help="Password to the associated username.")
+    parser.add_argument("entity", type=str,
+                        help="The SynapseId of the dataset entity. For the Abdomen dataset of the 'Multi-Atlas "
+                             "Labeling Beyond the Cranial Vault' challenge, the ids are:\n"
+                             "Abdomen: syn3553734\n"
+                             "RawData: syn3379050\n"
+                             "Reg-Training-Testing: syn3380218\n"
+                             "Reg-Training-Training: syn3380229")
+    parser.add_argument("download_dir", type=str, help="The location where the file should be downloaded to.")
+    main(parser.parse_args())
diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
new file mode 100644
index 00000000..ed03b6d9
--- /dev/null
+++ b/datasets/preprocess_data.py
@@ -0,0 +1,90 @@
+import argparse
+from pathlib import Path
+from typing import List
+
+import h5py
+import nibabel
+import numpy
+from tqdm import tqdm
+
+
+def get_case_ids_from_list(dataset_list_path: Path) -> List[str]:
+    # List entries look like "case0001_slice000"; extract the unique case ids (e.g. "0001").
+    with open(dataset_list_path, "r") as f:
+        slices = f.readlines()
+    return sorted({s.split("_")[0][4:].rstrip() for s in slices})
+
+
+def get_case_ids_from_directory(directory: Path) -> List[str]:
+    return sorted(f.stem for f in directory.iterdir())
+
+
+def main(args: argparse.Namespace):
+    image_dir = args.original_dataset_dir / "img"
+    if args.from_list_file is not None:
+        case_ids = get_case_ids_from_list(args.from_list_file)
+    else:
+        case_ids = get_case_ids_from_directory(image_dir)
+    print(f"Processing case ids: {case_ids}")
+
+    for case_id in tqdm(case_ids):
+        case_image_dir = image_dir / case_id
+        if not case_image_dir.exists():
+            print(f"Sub-directory {case_image_dir} doesn't seem to exist. Skipping.")
+            continue
+
+        for image_path in tqdm(case_image_dir.iterdir(), desc="Processing case files", leave=False):
+            label_id = f"label{image_path.name[3:]}"  # cuts "img" from the image filename and replaces it with "label"
+            label_path = args.original_dataset_dir / "label" / case_id / label_id
+            assert image_path.exists() and label_path.exists(), f"For id {case_id} either the image or label file " \
+                                                                f"is missing"
+            image_data = nibabel.load(image_path).get_fdata()
+            label_data = nibabel.load(label_path).get_fdata()
+
+            clipped_image_data = numpy.clip(image_data, *args.clip)
+            normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0])
+
+            # Reorder the data so that the channel/slice dimension is in front. This makes iteration easier when
+            # processing slices and when using the h5 file for testing.
+            transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1))
+            transposed_label_data = numpy.transpose(label_data, (2, 0, 1))
+
+            # Extract the 2D slices for training.
+            for i, (image_slice, label_slice) in tqdm(enumerate(zip(transposed_image_data, transposed_label_data)),
+                                                      desc="Processing slices", leave=False):
+                out_filename = args.target_dataset_dir / f"Synapse/train_npz/case{case_id}_slice{i:03d}.npz"
+
+                if not args.overwrite and out_filename.exists():  # Do not overwrite data unless the flag is set
+                    continue
+                out_filename.parent.mkdir(exist_ok=True, parents=True)
+                numpy.savez(out_filename, image=image_slice, label=label_slice)
+
+            # Keep the 3D volume in h5 format for testing cases.
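+            # The arrays are written slice-axis-first under the dataset names "image" and "label",
+            # which is the layout the Synapse test loader reads back.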
+            h5_filename = args.target_dataset_dir / f"Synapse/test_vol_h5/case{case_id}.npy.h5"
+            if not args.overwrite and h5_filename.exists():  # Do not overwrite data unless the flag is set
+                continue
+            h5_filename.parent.mkdir(exist_ok=True, parents=True)
+            with h5py.File(h5_filename, "w") as f:
+                f.create_dataset("image", data=transposed_image_data)
+                f.create_dataset("label", data=transposed_label_data)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("original_dataset_dir", type=Path,
+                        help="The root directory of the downloaded, original dataset.")
+    parser.add_argument("-td", "--target-dataset-dir", type=Path, default=Path("../../data"),
+                        help="The directory where the processed dataset should be stored.")
+    parser.add_argument("-fl", "--from-list-file", type=Path,
+                        help="Do not process all directories contained in the original dataset directory, but only "
+                             "those listed in the passed list file. The data in the list must be structured as in "
+                             "the train.txt file located in lists/lists_Synapse.")
+    parser.add_argument("--clip", nargs=2, type=float, default=[-125, 275],
+                        help="Two numbers [min max] defining the interval to which the original image data should "
+                             "be clipped.")
+    parser.add_argument("--overwrite", action="store_true", default=False,
+                        help="Overwrite the data present in the target dataset directory.")
+    parsed_args = parser.parse_args()
+    main(parsed_args)
diff --git a/requirements.txt b/requirements.txt
index 4abfe422..a9e046ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-torch==1.4.0
-torchvision==0.5.0
+torch>=1.4.0
+torchvision>=0.5.0
 numpy
 tqdm
 tensorboard
@@ -9,3 +9,5 @@ medpy
 SimpleITK
 scipy
 h5py
+nibabel
+synapseclient
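After preprocessing, the generated files can be sanity-checked by loading them back. The following is a minimal sketch, assuming the default `--target-dataset-dir` of `../../data` and a hypothetical case id `0001`; adjust both to your setup:

```python
from pathlib import Path

import h5py
import numpy

data_dir = Path("../../data/Synapse")  # default --target-dataset-dir used by preprocess_data.py

# Each training slice is an .npz archive holding one 2D "image" and one 2D "label" array.
with numpy.load(data_dir / "train_npz" / "case0001_slice000.npz") as npz:  # hypothetical case id
    image, label = npz["image"], npz["label"]
    assert image.shape == label.shape
    assert 0.0 <= image.min() and image.max() <= 1.0  # normalised to [0, 1] by preprocess_data.py

# Each test volume is an .h5 file with the slice axis in front.
with h5py.File(data_dir / "test_vol_h5" / "case0001.npy.h5", "r") as f:
    volume, labels = f["image"][:], f["label"][:]
    print(volume.shape, labels.shape)  # (number of slices, height, width) each
```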