From 380be668ac0f7402c5dc48d553ad2bd3b988d8cb Mon Sep 17 00:00:00 2001
From: hendraet
Date: Thu, 12 Aug 2021 16:42:53 +0200
Subject: [PATCH 1/7] Adds script to preprocess dataset

---
 datasets/preprocess_data.py | 100 ++++++++++++++++++++++++++++++++++++
 requirements.txt            |   5 +-
 2 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 datasets/preprocess_data.py

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
new file mode 100644
index 00000000..238711f7
--- /dev/null
+++ b/datasets/preprocess_data.py
@@ -0,0 +1,100 @@
+from pathlib import Path
+
+import argparse
+import h5py
+import nibabel
+import numpy
+
+
+# TODO: how to download testdata?
+def main(args: argparse.Namespace):
+    # Assuming filenames are sth. like 'DET0000101_avg.nii' or 'DET0000101_avg_seg.nii'
+    filename_stems = set([file.stem.split('_')[0] for file in args.original_dataset_dir.iterdir()])
+
+    for filename_stem in filename_stems:
+        case_id = filename_stem[-4:]
+
+        image_path = Path(args.original_dataset_dir / f'{filename_stem}_avg.nii')
+        label_path = Path(args.original_dataset_dir / f'{filename_stem}_avg_seg.nii')
+        assert image_path.exists() and label_path.exists(), f'For id {filename_stem} either the image or label file ' \
+                                                            f'is missing'
+        image_data = nibabel.load(image_path).get_fdata()
+        label_data = nibabel.load(label_path).get_fdata()
+
+        normalised_image_data = image_data / 255
+
+        # Reorders data so that the channel dimension is at the front for easier indexing later
+        transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1))
+        transposed_label_data = numpy.transpose(label_data, (2, 0, 1))
+
+        # Extracting slices for training
+        for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)):
+            out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz'
+            if not out_filename.parent.exists():
+                out_filename.parent.mkdir(exist_ok=True, parents=True)
+            numpy.savez(out_filename, image=image_slice, label=label_slice)
+
+        # keep the 3D volume in h5 format for testing cases.
+        # TODO: check if this is correct or if the testdata should be downloaded separately
+        h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5'
+        if not h5_filename.parent.exists():
+            h5_filename.parent.mkdir(exist_ok=True, parents=True)
+        with h5py.File(h5_filename, 'w') as f:
+            f.create_dataset('image', data=normalised_image_data)
+            f.create_dataset('label', data=label_data)
+
+    # --------
+    # cwd = '/content/drive/My Drive/TransUNet/Training-Testing'
+    # data_folder = '/content/drive/My Drive/TransUNet/data'
+    # subfolders = os.listdir(cwd + '/' + 'img')  # subfolders will be like ['0062', '0064', ...]
+    #
+    # # I chose subfolder '0066', but maybe you will want to iterate & combine
+    # for subfolder in ['0066']:  # subfolders[1:]:
+    #     print(subfolder)
+    #     tempwd = cwd + '/' + 'img' + '/' + subfolder
+    #     files = os.listdir(tempwd)  # files will be like ['img0032-0066.nii.gz', 'img0036-0066.nii.gz', ...]
+ # + # # iterate over filenames + # for filename in files: + # print(filename) + # righttext = filename[3:] # get the part 'xxxx-xxxx.nii.gz' + # subject = righttext[:4] + # img = nib.load(cwd + '/' + 'img' + '/' + subfolder + '/' + 'img' + righttext) + # label_data = nib.load(cwd + '/' + 'label' + '/' + subfolder + '/' + 'label' + righttext) + # + # # Convert them to numpy format, + # data = img.get_fdata() + # label_data = label_data.get_fdata() + # + # # clip the images within [-125, 275], + # data_clipped = np.clip(data, -125, 275) + # + # # normalize each 3D image to [0, 1], and + # data_normalised = (data_clipped - (-125)) / (275 - (-125)) + # + # # extract 2D slices from 3D volume for training cases while + # # e.g. slice 000 + # for i in range(data_normalised.shape[2]): + # formattedi = '{:03d}'.format(i) + # slice000 = data_normalised[:, :, i] + # label_slice000 = label_data[:, :, i] + # np.savez(data_folder + '/Synapse/train_npz/case' + subject + '_slice' + formattedi + '.npz', + # image=slice000, + # label=label_slice000) + # + # # keep the 3D volume in h5 format for testing cases. + # fn = data_folder + '/Synapse/test_vol_h5/case' + subject + '.npy.h5' + # f = h5py.File(fn, 'w') + # dset = f.create_dataset('image', data=data_normalised) + # dset = f.create_dataset('label', data=label_data) + # f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('original_dataset_dir', type=Path, + help='The root directory for the downloaded, original dataset') + parser.add_argument('-td', '--target_dataset_dir', type=Path, default=Path('../../data'), + help='The directory where the processed dataset should be stored.') + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/requirements.txt b/requirements.txt index 4abfe422..49b8739f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch==1.4.0 -torchvision==0.5.0 +torch>=1.4.0 +torchvision>=0.5.0 numpy tqdm tensorboard @@ -9,3 +9,4 @@ medpy SimpleITK scipy h5py +nibabel From 5e4ff76c8488f9cf25d0e002b83da674b3ec0807 Mon Sep 17 00:00:00 2001 From: hendraet Date: Fri, 13 Aug 2021 15:50:55 +0200 Subject: [PATCH 2/7] Adds download script and draft for new preprocessing file --- datasets/download_data.py | 28 ++++++++ datasets/preprocess_data.py | 129 ++++++++++++++++++++++++++---------- 2 files changed, 122 insertions(+), 35 deletions(-) create mode 100644 datasets/download_data.py diff --git a/datasets/download_data.py b/datasets/download_data.py new file mode 100644 index 00000000..240384b3 --- /dev/null +++ b/datasets/download_data.py @@ -0,0 +1,28 @@ +import argparse +import synapseclient + + +def main(args: argparse.Namespace): + syn = synapseclient.Synapse() + syn.login(args.username, args.password) + + entity = syn.get(entity=args.entity, downloadLocation=args.download_dir) + print("File downloaded successfully") + + # filepath = entity.path + # print(f"File was saved to {filepath}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("username", type=str, help="Username for your Synapse account.") + parser.add_argument("password", type=str, help="Password to the associated username.") + parser.add_argument("entity", type=str, + help="The SynapseId of the dataset entity. 
For the Abdomen dataset of the 'Multi-Atlas " + "Labeling Beyond the Cranial Vault' challenge, the ids are:\n" + "Abdomen: syn3553734\n" + "RawData: syn3379050\n" + "Reg-Training-Testing: syn3380218\n" + "Reg-Training-Training: syn3380229") + parser.add_argument("download_dir", type=str, help="The location where the file should be downloaded to.") + main(parser.parse_args()) diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py index 238711f7..05413d63 100644 --- a/datasets/preprocess_data.py +++ b/datasets/preprocess_data.py @@ -6,42 +6,96 @@ import numpy +def id_to_color(id: float) -> numpy.ndarray: + class_to_color_map = { + "background": "#000000", + "dimgray": "#696969", + "lightgray": "#d3d3d3", + "forestgreen": "#228b22", + "darkred": "#8b0000", + "olive": "#808000", + "lightseagreen": "#20b2aa", + "darkblue": "#00008b", + "red": "#ff0000", + "darkorange": "#ff8c00", + "yellow": "#ffff00", + "lime": "#00ff00", + "royalblue": "#4169e1", + "deepskyblue": "#00bfff", + "blue": "#0000ff", + "fuchsia": "#ff00ff", + "palevioletred": "#db7093", + "khaki": "#f0e68c", + "deeppink": "#ff1493", + "lightsalmon": "#ffa07a", + "violet": "#ee82ee", + } + from PIL import ImageColor + return numpy.asarray(ImageColor.getrgb(list(class_to_color_map.values())[int(id)])) + + # TODO: how to download testdata? +# TODO: specify in README which exact dataset has to be downloaded +def get_case_ids_from_list(dataset_list_path: Path): + with open(dataset_list_path, "r") as f: + slices = f.readlines() + case_ids = sorted(list(set([s.split("_")[0][4:] for s in slices]))) + return case_ids + + def main(args: argparse.Namespace): - # Assuming filename are sth. like 'DET0000101_avg.nii' or 'DET0000101_avg_seg.nii' - filename_stems = set([file.stem.split('_')[0] for file in args.original_dataset_dir.iterdir()]) - - for filename_stem in filename_stems: - case_id = filename_stem[-4:] - - image_path = Path(args.original_dataset_dir / f'{filename_stem}_avg.nii') - label_path = Path(args.original_dataset_dir / f'{filename_stem}_avg_seg.nii') - assert image_path.exists() and label_path.exists(), f'For id {filename_stem} either the image or label file ' \ - f'is missing' - image_data = nibabel.load(image_path).get_fdata() - label_data = nibabel.load(label_path).get_fdata() - - normalised_image_data = image_data / 255 - - # Reorders data so that the channel dimension is at the front for easier indexing later - transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1)) - transposed_label_data = numpy.transpose(label_data, (2, 0, 1)) - - # Extracting slices for training - for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)): - out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz' - if not out_filename.parent.exists(): - out_filename.parent.mkdir(exist_ok=True, parents=True) - numpy.savez(out_filename, image=image_slice, label=label_slice) - - # keep the 3D volume in h5 format for testing cases. 
- # TODO: check if this is correct or if the testdata should be downloaded separately - h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5' - if not h5_filename.parent.exists(): - h5_filename.parent.mkdir(exist_ok=True, parents=True) - with h5py.File(h5_filename, 'w') as f: - f.create_dataset('image', data=normalised_image_data) - f.create_dataset('label', data=label_data) + image_dir = args.original_dataset_dir / 'img' + case_ids = get_case_ids_from_list(args.list_path) + case_ids = ["0001"] # TODO + for case_id in case_ids: + case_image_dir = image_dir / case_id + if not case_image_dir.exists(): + print(f"Sub-directory {case_image_dir} doesn't seem to exist. Skipping") + continue + + for image_path in case_image_dir.iterdir(): + label_id = f"label{image_path.name[3:]}" # cuts "img" from the image filename and replaces it with "label" + label_path = args.original_dataset_dir / "label" / case_id / label_id + assert image_path.exists() and label_path.exists(), f'For id {case_id} either the image or label file ' \ + f'is missing' + image_data = nibabel.load(image_path).get_fdata() + label_data = nibabel.load(label_path).get_fdata() + + clipped_image_data = numpy.clip(image_data, *args.clip) + normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0]) + + # Reorders data so that the channel dimension is at the front for easier indexing later + transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1)) + transposed_label_data = numpy.transpose(label_data, (2, 0, 1)) + + # Extracting slices for training + for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)): + out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz' + + # TODO: remove + tmp_image = numpy.repeat(numpy.expand_dims(image_slice, axis=2), 3, axis=2) + tmp_label = numpy.zeros((*label_slice.shape, 3), dtype=numpy.uint8) + # for id in numpy.unique(label_data): + # if id == 0.0: + # continue + # mask = numpy.where(label_slice == id) + # tmp_label[mask] = id_to_color(id) + combined_array = numpy.concatenate(((tmp_image * 255).astype(numpy.uint8), tmp_label), axis=1) + from PIL import Image + Image.fromarray(combined_array, mode="RGB").show() + + if not out_filename.parent.exists(): + out_filename.parent.mkdir(exist_ok=True, parents=True) + numpy.savez(out_filename, image=image_slice, label=label_slice) + + # keep the 3D volume in h5 format for testing cases. 
+            # TODO: check if this is correct or if the testdata should be downloaded separately
+            h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5'
+            if not h5_filename.parent.exists():
+                h5_filename.parent.mkdir(exist_ok=True, parents=True)
+            with h5py.File(h5_filename, 'w') as f:
+                f.create_dataset('image', data=normalised_image_data)
+                f.create_dataset('label', data=label_data)
 
     # --------
     # cwd = '/content/drive/My Drive/TransUNet/Training-Testing'
@@ -94,7 +148,12 @@ def main(args: argparse.Namespace):
     parser = argparse.ArgumentParser()
     parser.add_argument('original_dataset_dir', type=Path,
                         help='The root directory for the downloaded, original dataset')
-    parser.add_argument('-td', '--target_dataset_dir', type=Path, default=Path('../../data'),
+    parser.add_argument('-td', '--target-dataset-dir', type=Path, default=Path('../../data'),
                         help='The directory where the processed dataset should be stored.')
+    parser.add_argument('-lp', '--list-path', type=Path, default=Path('../lists/lists_Synapse/train.txt'),
+                        help='Path to one of the dataset lists that contain the case ids that should be used.')
+    parser.add_argument('--clip', nargs=2, type=float, default=[-125, 275],
+                        help='Two numbers [min max] that represent the interval that should be clipped from the '
+                             'original image data.')
     parsed_args = parser.parse_args()
     main(parsed_args)

From 16f8956f981d67a6e9b80bd22d59b24e42d944ae Mon Sep 17 00:00:00 2001
From: hendraet
Date: Mon, 16 Aug 2021 15:00:54 +0200
Subject: [PATCH 3/7] Adapts scripts for data downloading and preprocessing

---
 datasets/README.md          |  21 ++++-
 datasets/download_data.py   |   3 ---
 datasets/preprocess_data.py | 164 +++++++++++--------------------------
 3 files changed, 66 insertions(+), 122 deletions(-)

diff --git a/datasets/README.md b/datasets/README.md
index c662f8e2..516f4b49 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -1,7 +1,20 @@
 # Data Preparing
-1. Access to the synapse multi-organ dataset:
-   1. Sign up in the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and download the dataset. Convert them to numpy format, clip the images within [-125, 275], normalize each 3D image to [0, 1], and extract 2D slices from 3D volume for training cases while keeping the 3D volume in h5 format for testing cases.
+1. Access the synapse multi-organ dataset by signing up on the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and downloading the Abdomen dataset.
+   This can also be done by using the `download_data.py` script in the `datasets` directory:
+   ```
+   python download_data.py [Synapse username] [Synapse password] [SynapseID of the dataset] [directory where the file should be stored]
+   ```
+   Using `--help` also displays the SynapseIds of the Abdomen dataset.
+   (You probably want to download either `Reg-Training-Testing` or `Reg-Training-Training`.)
+2. Preprocess the data:
+   1. Use the `preprocess_data.py` script in the `datasets` directory.
+      It will clip the image data within the range [-125, 275], normalize it to [0, 1], extract 2D slices from the 3D volumes and save them in the appropriate file formats.
+   ```
+   python preprocess_data.py [Location of the unzipped abdomen dataset]
+   ```
+   By default, the data in the target directory won't be overwritten unless the `--overwrite` parameter is passed.
+   For an overview of additional arguments use the `--help` option.
 2. You can also send an Email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction.
 2.
The directory structure of the whole project is as follows: @@ -9,7 +22,9 @@ . ├── TransUNet │   ├──datasets -│   │    └── dataset_*.py +│   │    ├── dataset_*.py +│   │    ├── download_data.py +│   │    └── preprocess_data.py │   ├──train.py │   ├──test.py │   └──... diff --git a/datasets/download_data.py b/datasets/download_data.py index 240384b3..895bb32c 100644 --- a/datasets/download_data.py +++ b/datasets/download_data.py @@ -9,9 +9,6 @@ def main(args: argparse.Namespace): entity = syn.get(entity=args.entity, downloadLocation=args.download_dir) print("File downloaded successfully") - # filepath = entity.path - # print(f"File was saved to {filepath}") - if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py index 05413d63..1db74ccc 100644 --- a/datasets/preprocess_data.py +++ b/datasets/preprocess_data.py @@ -1,159 +1,91 @@ +import argparse from pathlib import Path +from typing import List -import argparse import h5py import nibabel import numpy +from tqdm import tqdm -def id_to_color(id: float) -> numpy.ndarray: - class_to_color_map = { - "background": "#000000", - "dimgray": "#696969", - "lightgray": "#d3d3d3", - "forestgreen": "#228b22", - "darkred": "#8b0000", - "olive": "#808000", - "lightseagreen": "#20b2aa", - "darkblue": "#00008b", - "red": "#ff0000", - "darkorange": "#ff8c00", - "yellow": "#ffff00", - "lime": "#00ff00", - "royalblue": "#4169e1", - "deepskyblue": "#00bfff", - "blue": "#0000ff", - "fuchsia": "#ff00ff", - "palevioletred": "#db7093", - "khaki": "#f0e68c", - "deeppink": "#ff1493", - "lightsalmon": "#ffa07a", - "violet": "#ee82ee", - } - from PIL import ImageColor - return numpy.asarray(ImageColor.getrgb(list(class_to_color_map.values())[int(id)])) - - -# TODO: how to download testdata? # TODO: specify in README which exact dataset has to be downloaded -def get_case_ids_from_list(dataset_list_path: Path): +def get_case_ids_from_list(dataset_list_path: Path) -> List[str]: with open(dataset_list_path, "r") as f: slices = f.readlines() case_ids = sorted(list(set([s.split("_")[0][4:] for s in slices]))) return case_ids +def get_case_ids_from_directory(directory: Path) -> List[str]: + return [f.stem for f in directory.iterdir()] + + def main(args: argparse.Namespace): - image_dir = args.original_dataset_dir / 'img' - case_ids = get_case_ids_from_list(args.list_path) - case_ids = ["0001"] # TODO - for case_id in case_ids: + image_dir = args.original_dataset_dir / "img" + if args.from_list_file is not None: + case_ids = get_case_ids_from_list(args.from_list_file) + else: + case_ids = get_case_ids_from_directory(image_dir) + print(f"Processing case ids: {case_ids}") + + for case_id in tqdm(case_ids): case_image_dir = image_dir / case_id if not case_image_dir.exists(): print(f"Sub-directory {case_image_dir} doesn't seem to exist. 
Skipping") continue - for image_path in case_image_dir.iterdir(): + for image_path in tqdm(case_image_dir.iterdir(), desc="Processing case files", leave=False): label_id = f"label{image_path.name[3:]}" # cuts "img" from the image filename and replaces it with "label" label_path = args.original_dataset_dir / "label" / case_id / label_id - assert image_path.exists() and label_path.exists(), f'For id {case_id} either the image or label file ' \ - f'is missing' + assert image_path.exists() and label_path.exists(), f"For id {case_id} either the image or label file " \ + f"is missing" image_data = nibabel.load(image_path).get_fdata() label_data = nibabel.load(label_path).get_fdata() clipped_image_data = numpy.clip(image_data, *args.clip) normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0]) - # Reorders data so that the channel dimension is at the front for easier indexing later + # Reorders data so that the channel dimension is at the front for easier iteration in the subsequent + # for-loop transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1)) transposed_label_data = numpy.transpose(label_data, (2, 0, 1)) # Extracting slices for training - for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)): - out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz' - - # TODO: remove - tmp_image = numpy.repeat(numpy.expand_dims(image_slice, axis=2), 3, axis=2) - tmp_label = numpy.zeros((*label_slice.shape, 3), dtype=numpy.uint8) - # for id in numpy.unique(label_data): - # if id == 0.0: - # continue - # mask = numpy.where(label_slice == id) - # tmp_label[mask] = id_to_color(id) - combined_array = numpy.concatenate(((tmp_image * 255).astype(numpy.uint8), tmp_label), axis=1) - from PIL import Image - Image.fromarray(combined_array, mode="RGB").show() + for i, (image_slice, label_slice) in tqdm(enumerate(zip(transposed_image_data, transposed_label_data)), + desc="Processing slices", leave=False): + out_filename = args.target_dataset_dir / f"Synapse/train_npz/case{case_id}_slice{i:03d}.npz" + if not args.overwrite and out_filename.exists(): # Do not overwrite data unless flag is set + continue if not out_filename.parent.exists(): out_filename.parent.mkdir(exist_ok=True, parents=True) numpy.savez(out_filename, image=image_slice, label=label_slice) # keep the 3D volume in h5 format for testing cases. - # TODO: check if this is correct or if the testdata should be downloaded separately - h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5' + h5_filename = args.target_dataset_dir / f"Synapse/test_vol_h5/case{case_id}.npy.h5" + if not args.overwrite and h5_filename.exists(): # Do not overwrite data unless flag is set + continue if not h5_filename.parent.exists(): h5_filename.parent.mkdir(exist_ok=True, parents=True) - with h5py.File(h5_filename, 'w') as f: - f.create_dataset('image', data=normalised_image_data) - f.create_dataset('label', data=label_data) - - # -------- - # cwd = '/content/drive/My Drive/TransUNet/Training-Testing' - # data_folder = '/content/drive/My Drive/TransUNet/data' - # subfolders = os.listdir(cwd + '/' + 'img') # subfolders will be like ['0062', '0064', ...] 
- # - # # I chose subfolder '0066', but maybe you will want to iterate & combine - # for subfolder in ['0066']: # subfolders[1:]: - # print(subfolder) - # tempwd = cwd + '/' + 'img' + '/' + subfolder - # files = os.listdir(tempwd) # files will be like ['img0032-0066.nii.gz', 'img0036-0066.nii.gz', ...] - # - # # iterate over filenames - # for filename in files: - # print(filename) - # righttext = filename[3:] # get the part 'xxxx-xxxx.nii.gz' - # subject = righttext[:4] - # img = nib.load(cwd + '/' + 'img' + '/' + subfolder + '/' + 'img' + righttext) - # label_data = nib.load(cwd + '/' + 'label' + '/' + subfolder + '/' + 'label' + righttext) - # - # # Convert them to numpy format, - # data = img.get_fdata() - # label_data = label_data.get_fdata() - # - # # clip the images within [-125, 275], - # data_clipped = np.clip(data, -125, 275) - # - # # normalize each 3D image to [0, 1], and - # data_normalised = (data_clipped - (-125)) / (275 - (-125)) - # - # # extract 2D slices from 3D volume for training cases while - # # e.g. slice 000 - # for i in range(data_normalised.shape[2]): - # formattedi = '{:03d}'.format(i) - # slice000 = data_normalised[:, :, i] - # label_slice000 = label_data[:, :, i] - # np.savez(data_folder + '/Synapse/train_npz/case' + subject + '_slice' + formattedi + '.npz', - # image=slice000, - # label=label_slice000) - # - # # keep the 3D volume in h5 format for testing cases. - # fn = data_folder + '/Synapse/test_vol_h5/case' + subject + '.npy.h5' - # f = h5py.File(fn, 'w') - # dset = f.create_dataset('image', data=data_normalised) - # dset = f.create_dataset('label', data=label_data) - # f.close() - - -if __name__ == '__main__': + with h5py.File(h5_filename, "w") as f: + f.create_dataset("image", data=normalised_image_data) + f.create_dataset("label", data=label_data) + + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('original_dataset_dir', type=Path, - help='The root directory for the downloaded, original dataset') - parser.add_argument('-td', '--target-dataset-dir', type=Path, default=Path('../../data'), - help='The directory where the processed dataset should be stored.') - parser.add_argument('-lp', '--list-path', type=Path, default=Path('../lists/lists_Synapse/train.txt'), - help='Path to one of the dataset lists that contain the case ids that should be used.') - parser.add_argument('--clip', nargs=2, type=float, default=[-125, 275], - help='Two numbers [min max] that represent the interval that should be clipped from the ' - 'original image data.') + parser.add_argument("original_dataset_dir", type=Path, + help="The root directory for the downloaded, original dataset") + parser.add_argument("-td", "--target-dataset-dir", type=Path, default=Path("../../data"), + help="The directory where the processed dataset should be stored.") + parser.add_argument("-fl", "--from-list-file", type=Path, + help="Do not process all directories that are contained in the original dataset directory, " + "but use those contained in the passed list file. 
The data in the list must be "
+                             "structured as in the train.txt file located in lists/lists_Synapse.")
+    parser.add_argument("--clip", nargs=2, type=float, default=[-125, 275],
+                        help="Two numbers [min max] that represent the interval that should be clipped from the "
+                             "original image data.")
+    parser.add_argument("--overwrite", action="store_true", default=False,
+                        help="Overwrite the data present in the target dataset directory")
     parsed_args = parser.parse_args()
     main(parsed_args)

From 19b7a6110890a7f9e661b1790d4d18cb190d8ee5 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Wed, 18 Aug 2021 09:48:02 +0200
Subject: [PATCH 4/7] Minor fix

---
 datasets/preprocess_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
index 1db74ccc..ddc000d2 100644
--- a/datasets/preprocess_data.py
+++ b/datasets/preprocess_data.py
@@ -12,7 +12,7 @@
 def get_case_ids_from_list(dataset_list_path: Path) -> List[str]:
     with open(dataset_list_path, "r") as f:
         slices = f.readlines()
-    case_ids = sorted(list(set([s.split("_")[0][4:] for s in slices])))
+    case_ids = sorted(list(set([s.split("_")[0][4:].rstrip() for s in slices])))
     return case_ids
 
 

From 9556e79ab8674e95935dc8b907ae839236323bc3 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Thu, 19 Aug 2021 15:36:12 +0200
Subject: [PATCH 5/7] Adds synapseclient dependency to requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 49b8739f..a9e046ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ SimpleITK
 scipy
 h5py
 nibabel
+synapseclient

From a58563efa7c99ad84172f1cc668552d483a8b8b1 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Mon, 30 Aug 2021 15:02:49 +0200
Subject: [PATCH 6/7] Removes TODO

---
 datasets/preprocess_data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
index ddc000d2..a6c5e645 100644
--- a/datasets/preprocess_data.py
+++ b/datasets/preprocess_data.py
@@ -8,7 +8,6 @@
 from tqdm import tqdm
 
 
-# TODO: specify in README which exact dataset has to be downloaded
 def get_case_ids_from_list(dataset_list_path: Path) -> List[str]:
     with open(dataset_list_path, "r") as f:
         slices = f.readlines()

From 3fb1c04881bf9862a9e730ec61622596b5ce1395 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Fri, 3 Sep 2021 13:23:10 +0200
Subject: [PATCH 7/7] Fixes error in test data preprocessing

---
 datasets/preprocess_data.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
index a6c5e645..ed03b6d9 100644
--- a/datasets/preprocess_data.py
+++ b/datasets/preprocess_data.py
@@ -44,8 +44,8 @@ def main(args: argparse.Namespace):
             clipped_image_data = numpy.clip(image_data, *args.clip)
             normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0])
 
-            # Reorders data so that the channel dimension is at the front for easier iteration in the subsequent
-            # for-loop
+            # Reorders data so that the channel/slice dimension is in front. Makes iteration easier when processing
+            # slices and when using the h5 file for testing
             transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1))
             transposed_label_data = numpy.transpose(label_data, (2, 0, 1))
 
@@ -53,9 +53,9 @@ def main(args: argparse.Namespace):
             for i, (image_slice, label_slice) in tqdm(enumerate(zip(transposed_image_data, transposed_label_data)),
                                                       desc="Processing slices", leave=False):
                 out_filename = args.target_dataset_dir / f"Synapse/train_npz/case{case_id}_slice{i:03d}.npz"
+
                 if not args.overwrite and out_filename.exists():  # Do not overwrite data unless flag is set
                     continue
-
                 if not out_filename.parent.exists():
                     out_filename.parent.mkdir(exist_ok=True, parents=True)
                 numpy.savez(out_filename, image=image_slice, label=label_slice)
@@ -67,8 +67,8 @@ def main(args: argparse.Namespace):
             if not h5_filename.parent.exists():
                 h5_filename.parent.mkdir(exist_ok=True, parents=True)
             with h5py.File(h5_filename, "w") as f:
-                f.create_dataset("image", data=normalised_image_data)
-                f.create_dataset("label", data=label_data)
+                f.create_dataset("image", data=transposed_image_data)
+                f.create_dataset("label", data=transposed_label_data)
 
 
 if __name__ == "__main__":
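A minimal sketch, not part of the patch series above, of how the files written by `preprocess_data.py` could be sanity-checked after applying these patches; the case and slice numbers are hypothetical examples, and the data root assumes the script's default `--target-dataset-dir` of `../../data`:

```python
from pathlib import Path

import h5py
import numpy

# Default --target-dataset-dir used by preprocess_data.py; adjust if a different directory was passed.
data_root = Path("../../data/Synapse")

# Inspect one training slice (hypothetical case/slice id): each .npz holds an 'image' and a 'label' array.
slice_file = data_root / "train_npz" / "case0001_slice000.npz"
with numpy.load(slice_file) as npz:
    image, label = npz["image"], npz["label"]
    print(slice_file.name, image.shape, float(image.min()), float(image.max()), numpy.unique(label))

# Inspect one test volume: after the last patch both datasets are stored slice-first (slice, height, width).
volume_file = data_root / "test_vol_h5" / "case0001.npy.h5"
with h5py.File(volume_file, "r") as h5:
    print(volume_file.name, h5["image"].shape, h5["label"].shape)
```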