21 changes: 18 additions & 3 deletions datasets/README.md
@@ -1,15 +1,30 @@
# Data Preparation

1. Access the Synapse multi-organ dataset:
1. Sign up on the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and download the dataset. Convert it to numpy format, clip the images within [-125, 275], normalize each 3D image to [0, 1], and extract 2D slices from the 3D volumes for training cases, while keeping the 3D volumes in h5 format for testing cases.
1. Access the Synapse multi-organ dataset by signing up on the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and downloading the Abdomen dataset.
This can also be done with the `download_data.py` script in the `datasets` directory:
```
python download_data.py [Synapse username] [Synapse password] [SynapseID of the dataset] [directory where the file should be stored]
```
Running the script with `--help` also displays the SynapseIds for the Abdomen dataset.
(You most likely want to download either `Reg-Training-Testing` or `Reg-Training-Training`.)
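For example, a hypothetical invocation, with placeholder credentials and a placeholder download directory, might look like:
```
python download_data.py my_user my_password syn3380229 ./raw_data
```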
2. Preprocess the data:
1. Use the `preprocess_data.py` script in the `datasets` directory.
It will clip the image data to the range [-125, 275], normalize it to [0, 1], extract 2D slices from the 3D volumes, and save everything in the appropriate file formats.
```
python preprocess_data.py [Location of the unzipped abdomen dataset]
```
Existing data in the target directory won't be overwritten unless the `--overwrite` flag is passed.
For an overview of additional arguments, use the `--help` option.
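For example, assuming the downloaded archive was unzipped to a hypothetical `./RawData` directory:
```
python preprocess_data.py ./RawData --target-dataset-dir ../../data
```
(`../../data` is also the default target directory, so the flag may be omitted.)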
2. You can also send an email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction.
2. The directory structure of the whole project is as follows:

```bash
.
├── TransUNet
│   ├──datasets
│   │    └── dataset_*.py
│   │    ├── dataset_*.py
│   │    ├── download_data.py
│   │    └── preprocess_data.py
│   ├──train.py
│   ├──test.py
│   └──...
25 changes: 25 additions & 0 deletions datasets/download_data.py
@@ -0,0 +1,25 @@
import argparse
import synapseclient


def main(args: argparse.Namespace):
syn = synapseclient.Synapse()
syn.login(args.username, args.password)

    # Fetch the entity and download its file(s) into the given directory
    entity = syn.get(entity=args.entity, downloadLocation=args.download_dir)
print("File downloaded successfully")


if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("username", type=str, help="Username for your Synapse account.")
parser.add_argument("password", type=str, help="Password to the associated username.")
parser.add_argument("entity", type=str,
help="The SynapseId of the dataset entity. For the Abdomen dataset of the 'Multi-Atlas "
"Labeling Beyond the Cranial Vault' challenge, the ids are:\n"
"Abdomen: syn3553734\n"
"RawData: syn3379050\n"
"Reg-Training-Testing: syn3380218\n"
"Reg-Training-Training: syn3380229")
parser.add_argument("download_dir", type=str, help="The location where the file should be downloaded to.")
main(parser.parse_args())
90 changes: 90 additions & 0 deletions datasets/preprocess_data.py
@@ -0,0 +1,90 @@
import argparse
from pathlib import Path
from typing import List

import h5py
import nibabel
import numpy
from tqdm import tqdm


def get_case_ids_from_list(dataset_list_path: Path) -> List[str]:
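    # Lines in the list file look like e.g. "case0005_slice000"; stripping the
    # leading "case" from the part before "_" recovers the case id "0005".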
with open(dataset_list_path, "r") as f:
slices = f.readlines()
case_ids = sorted(list(set([s.split("_")[0][4:].rstrip() for s in slices])))
return case_ids


def get_case_ids_from_directory(directory: Path) -> List[str]:
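    # Each entry below img/ corresponds to one case; its stem is used as the case id.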
return [f.stem for f in directory.iterdir()]


def main(args: argparse.Namespace):
image_dir = args.original_dataset_dir / "img"
if args.from_list_file is not None:
case_ids = get_case_ids_from_list(args.from_list_file)
else:
case_ids = get_case_ids_from_directory(image_dir)
print(f"Processing case ids: {case_ids}")

for case_id in tqdm(case_ids):
case_image_dir = image_dir / case_id
if not case_image_dir.exists():
print(f"Sub-directory {case_image_dir} doesn't seem to exist. Skipping")
continue

for image_path in tqdm(case_image_dir.iterdir(), desc="Processing case files", leave=False):
label_id = f"label{image_path.name[3:]}" # cuts "img" from the image filename and replaces it with "label"
label_path = args.original_dataset_dir / "label" / case_id / label_id
assert image_path.exists() and label_path.exists(), f"For id {case_id} either the image or label file " \
f"is missing"
image_data = nibabel.load(image_path).get_fdata()
label_data = nibabel.load(label_path).get_fdata()

            # Clip to the given intensity window, then rescale linearly to [0, 1];
            # e.g. with the default [-125, 275], an intensity of 75 maps to (75 + 125) / 400 = 0.5
            clipped_image_data = numpy.clip(image_data, *args.clip)
            normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0])

# Reorders data so that the channel/slice dimension is in front. Makes iteration easier when processing
# slices and when using the h5 file for testing
transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1))
transposed_label_data = numpy.transpose(label_data, (2, 0, 1))

# Extracting slices for training
for i, (image_slice, label_slice) in tqdm(enumerate(zip(transposed_image_data, transposed_label_data)),
desc="Processing slices", leave=False):
out_filename = args.target_dataset_dir / f"Synapse/train_npz/case{case_id}_slice{i:03d}.npz"

if not args.overwrite and out_filename.exists(): # Do not overwrite data unless flag is set
continue
if not out_filename.parent.exists():
out_filename.parent.mkdir(exist_ok=True, parents=True)
numpy.savez(out_filename, image=image_slice, label=label_slice)

# keep the 3D volume in h5 format for testing cases.
h5_filename = args.target_dataset_dir / f"Synapse/test_vol_h5/case{case_id}.npy.h5"
if not args.overwrite and h5_filename.exists(): # Do not overwrite data unless flag is set
continue
if not h5_filename.parent.exists():
h5_filename.parent.mkdir(exist_ok=True, parents=True)
with h5py.File(h5_filename, "w") as f:
f.create_dataset("image", data=transposed_image_data)
f.create_dataset("label", data=transposed_label_data)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("original_dataset_dir", type=Path,
help="The root directory for the downloaded, original dataset")
parser.add_argument("-td", "--target-dataset-dir", type=Path, default=Path("../../data"),
help="The directory where the processed dataset should be stored.")
parser.add_argument("-fl", "--from-list-file", type=Path,
help="Do not process all directories that are contained in the original dataset directory, "
"but use those contained in the passed list file. The data in the list must be "
"structured as in the train.txt file located in lists/lists_Synapse.")
parser.add_argument("--clip", nargs=2, type=float, default=[-125, 275],
help="Two numbers [min max] that represent the interval that should be clipped from the "
"original image data.")
parser.add_argument("--overwrite", action="store_true", default=False,
help="Overwrite the data present in the target dataset directory")
parsed_args = parser.parse_args()
main(parsed_args)
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
torch==1.4.0
torchvision==0.5.0
torch>=1.4.0
torchvision>=0.5.0
numpy
tqdm
tensorboard
@@ -9,3 +9,5 @@ medpy
SimpleITK
scipy
h5py
nibabel
synapseclient