From 380be668ac0f7402c5dc48d553ad2bd3b988d8cb Mon Sep 17 00:00:00 2001
From: hendraet
Date: Thu, 12 Aug 2021 16:42:53 +0200
Subject: [PATCH 1/7] Adds script to preprocess dataset

---
 datasets/preprocess_data.py | 100 ++++++++++++++++++++++++++++++++++++
 requirements.txt            |   5 +-
 2 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 datasets/preprocess_data.py

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
new file mode 100644
index 00000000..238711f7
--- /dev/null
+++ b/datasets/preprocess_data.py
@@ -0,0 +1,100 @@
+from pathlib import Path
+
+import argparse
+import h5py
+import nibabel
+import numpy
+
+
+# TODO: how to download testdata?
+def main(args: argparse.Namespace):
+    # Assuming filenames are sth. like 'DET0000101_avg.nii' or 'DET0000101_avg_seg.nii'
+    filename_stems = set([file.stem.split('_')[0] for file in args.original_dataset_dir.iterdir()])
+
+    for filename_stem in filename_stems:
+        case_id = filename_stem[-4:]
+
+        image_path = Path(args.original_dataset_dir / f'{filename_stem}_avg.nii')
+        label_path = Path(args.original_dataset_dir / f'{filename_stem}_avg_seg.nii')
+        assert image_path.exists() and label_path.exists(), f'For id {filename_stem} either the image or label file ' \
+                                                            f'is missing'
+        image_data = nibabel.load(image_path).get_fdata()
+        label_data = nibabel.load(label_path).get_fdata()
+
+        normalised_image_data = image_data / 255
+
+        # Reorders data so that the channel dimension is at the front for easier indexing later
+        transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1))
+        transposed_label_data = numpy.transpose(label_data, (2, 0, 1))
+
+        # Extracting slices for training
+        for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)):
+            out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz'
+            if not out_filename.parent.exists():
+                out_filename.parent.mkdir(exist_ok=True, parents=True)
+            numpy.savez(out_filename, image=image_slice, label=label_slice)
+
+        # keep the 3D volume in h5 format for testing cases.
+        # TODO: check if this is correct or if the testdata should be downloaded separately
+        h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5'
+        if not h5_filename.parent.exists():
+            h5_filename.parent.mkdir(exist_ok=True, parents=True)
+        with h5py.File(h5_filename, 'w') as f:
+            f.create_dataset('image', data=normalised_image_data)
+            f.create_dataset('label', data=label_data)
+
+    # --------
+    # cwd = '/content/drive/My Drive/TransUNet/Training-Testing'
+    # data_folder = '/content/drive/My Drive/TransUNet/data'
+    # subfolders = os.listdir(cwd + '/' + 'img')  # subfolders will be like ['0062', '0064', ...]
+    #
+    # # I chose subfolder '0066', but maybe you will want to iterate & combine
+    # for subfolder in ['0066']:  # subfolders[1:]:
+    #     print(subfolder)
+    #     tempwd = cwd + '/' + 'img' + '/' + subfolder
+    #     files = os.listdir(tempwd)  # files will be like ['img0032-0066.nii.gz', 'img0036-0066.nii.gz', ...]
+ # + # # iterate over filenames + # for filename in files: + # print(filename) + # righttext = filename[3:] # get the part 'xxxx-xxxx.nii.gz' + # subject = righttext[:4] + # img = nib.load(cwd + '/' + 'img' + '/' + subfolder + '/' + 'img' + righttext) + # label_data = nib.load(cwd + '/' + 'label' + '/' + subfolder + '/' + 'label' + righttext) + # + # # Convert them to numpy format, + # data = img.get_fdata() + # label_data = label_data.get_fdata() + # + # # clip the images within [-125, 275], + # data_clipped = np.clip(data, -125, 275) + # + # # normalize each 3D image to [0, 1], and + # data_normalised = (data_clipped - (-125)) / (275 - (-125)) + # + # # extract 2D slices from 3D volume for training cases while + # # e.g. slice 000 + # for i in range(data_normalised.shape[2]): + # formattedi = '{:03d}'.format(i) + # slice000 = data_normalised[:, :, i] + # label_slice000 = label_data[:, :, i] + # np.savez(data_folder + '/Synapse/train_npz/case' + subject + '_slice' + formattedi + '.npz', + # image=slice000, + # label=label_slice000) + # + # # keep the 3D volume in h5 format for testing cases. + # fn = data_folder + '/Synapse/test_vol_h5/case' + subject + '.npy.h5' + # f = h5py.File(fn, 'w') + # dset = f.create_dataset('image', data=data_normalised) + # dset = f.create_dataset('label', data=label_data) + # f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('original_dataset_dir', type=Path, + help='The root directory for the downloaded, original dataset') + parser.add_argument('-td', '--target_dataset_dir', type=Path, default=Path('../../data'), + help='The directory where the processed dataset should be stored.') + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/requirements.txt b/requirements.txt index 4abfe422..49b8739f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch==1.4.0 -torchvision==0.5.0 +torch>=1.4.0 +torchvision>=0.5.0 numpy tqdm tensorboard @@ -9,3 +9,4 @@ medpy SimpleITK scipy h5py +nibabel From 5e4ff76c8488f9cf25d0e002b83da674b3ec0807 Mon Sep 17 00:00:00 2001 From: hendraet Date: Fri, 13 Aug 2021 15:50:55 +0200 Subject: [PATCH 2/7] Adds download script and draft for new preprocessing file --- datasets/download_data.py | 28 ++++++++ datasets/preprocess_data.py | 129 ++++++++++++++++++++++++++---------- 2 files changed, 122 insertions(+), 35 deletions(-) create mode 100644 datasets/download_data.py diff --git a/datasets/download_data.py b/datasets/download_data.py new file mode 100644 index 00000000..240384b3 --- /dev/null +++ b/datasets/download_data.py @@ -0,0 +1,28 @@ +import argparse +import synapseclient + + +def main(args: argparse.Namespace): + syn = synapseclient.Synapse() + syn.login(args.username, args.password) + + entity = syn.get(entity=args.entity, downloadLocation=args.download_dir) + print("File downloaded successfully") + + # filepath = entity.path + # print(f"File was saved to {filepath}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("username", type=str, help="Username for your Synapse account.") + parser.add_argument("password", type=str, help="Password to the associated username.") + parser.add_argument("entity", type=str, + help="The SynapseId of the dataset entity. 
For the Abdomen dataset of the 'Multi-Atlas " + "Labeling Beyond the Cranial Vault' challenge, the ids are:\n" + "Abdomen: syn3553734\n" + "RawData: syn3379050\n" + "Reg-Training-Testing: syn3380218\n" + "Reg-Training-Training: syn3380229") + parser.add_argument("download_dir", type=str, help="The location where the file should be downloaded to.") + main(parser.parse_args()) diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py index 238711f7..05413d63 100644 --- a/datasets/preprocess_data.py +++ b/datasets/preprocess_data.py @@ -6,42 +6,96 @@ import numpy +def id_to_color(id: float) -> numpy.ndarray: + class_to_color_map = { + "background": "#000000", + "dimgray": "#696969", + "lightgray": "#d3d3d3", + "forestgreen": "#228b22", + "darkred": "#8b0000", + "olive": "#808000", + "lightseagreen": "#20b2aa", + "darkblue": "#00008b", + "red": "#ff0000", + "darkorange": "#ff8c00", + "yellow": "#ffff00", + "lime": "#00ff00", + "royalblue": "#4169e1", + "deepskyblue": "#00bfff", + "blue": "#0000ff", + "fuchsia": "#ff00ff", + "palevioletred": "#db7093", + "khaki": "#f0e68c", + "deeppink": "#ff1493", + "lightsalmon": "#ffa07a", + "violet": "#ee82ee", + } + from PIL import ImageColor + return numpy.asarray(ImageColor.getrgb(list(class_to_color_map.values())[int(id)])) + + # TODO: how to download testdata? +# TODO: specify in README which exact dataset has to be downloaded +def get_case_ids_from_list(dataset_list_path: Path): + with open(dataset_list_path, "r") as f: + slices = f.readlines() + case_ids = sorted(list(set([s.split("_")[0][4:] for s in slices]))) + return case_ids + + def main(args: argparse.Namespace): - # Assuming filename are sth. like 'DET0000101_avg.nii' or 'DET0000101_avg_seg.nii' - filename_stems = set([file.stem.split('_')[0] for file in args.original_dataset_dir.iterdir()]) - - for filename_stem in filename_stems: - case_id = filename_stem[-4:] - - image_path = Path(args.original_dataset_dir / f'{filename_stem}_avg.nii') - label_path = Path(args.original_dataset_dir / f'{filename_stem}_avg_seg.nii') - assert image_path.exists() and label_path.exists(), f'For id {filename_stem} either the image or label file ' \ - f'is missing' - image_data = nibabel.load(image_path).get_fdata() - label_data = nibabel.load(label_path).get_fdata() - - normalised_image_data = image_data / 255 - - # Reorders data so that the channel dimension is at the front for easier indexing later - transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1)) - transposed_label_data = numpy.transpose(label_data, (2, 0, 1)) - - # Extracting slices for training - for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)): - out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz' - if not out_filename.parent.exists(): - out_filename.parent.mkdir(exist_ok=True, parents=True) - numpy.savez(out_filename, image=image_slice, label=label_slice) - - # keep the 3D volume in h5 format for testing cases. 
- # TODO: check if this is correct or if the testdata should be downloaded separately - h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5' - if not h5_filename.parent.exists(): - h5_filename.parent.mkdir(exist_ok=True, parents=True) - with h5py.File(h5_filename, 'w') as f: - f.create_dataset('image', data=normalised_image_data) - f.create_dataset('label', data=label_data) + image_dir = args.original_dataset_dir / 'img' + case_ids = get_case_ids_from_list(args.list_path) + case_ids = ["0001"] # TODO + for case_id in case_ids: + case_image_dir = image_dir / case_id + if not case_image_dir.exists(): + print(f"Sub-directory {case_image_dir} doesn't seem to exist. Skipping") + continue + + for image_path in case_image_dir.iterdir(): + label_id = f"label{image_path.name[3:]}" # cuts "img" from the image filename and replaces it with "label" + label_path = args.original_dataset_dir / "label" / case_id / label_id + assert image_path.exists() and label_path.exists(), f'For id {case_id} either the image or label file ' \ + f'is missing' + image_data = nibabel.load(image_path).get_fdata() + label_data = nibabel.load(label_path).get_fdata() + + clipped_image_data = numpy.clip(image_data, *args.clip) + normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0]) + + # Reorders data so that the channel dimension is at the front for easier indexing later + transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1)) + transposed_label_data = numpy.transpose(label_data, (2, 0, 1)) + + # Extracting slices for training + for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)): + out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz' + + # TODO: remove + tmp_image = numpy.repeat(numpy.expand_dims(image_slice, axis=2), 3, axis=2) + tmp_label = numpy.zeros((*label_slice.shape, 3), dtype=numpy.uint8) + # for id in numpy.unique(label_data): + # if id == 0.0: + # continue + # mask = numpy.where(label_slice == id) + # tmp_label[mask] = id_to_color(id) + combined_array = numpy.concatenate(((tmp_image * 255).astype(numpy.uint8), tmp_label), axis=1) + from PIL import Image + Image.fromarray(combined_array, mode="RGB").show() + + if not out_filename.parent.exists(): + out_filename.parent.mkdir(exist_ok=True, parents=True) + numpy.savez(out_filename, image=image_slice, label=label_slice) + + # keep the 3D volume in h5 format for testing cases. 
+            # TODO: check if this is correct or if the testdata should be downloaded separately
+            h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5'
+            if not h5_filename.parent.exists():
+                h5_filename.parent.mkdir(exist_ok=True, parents=True)
+            with h5py.File(h5_filename, 'w') as f:
+                f.create_dataset('image', data=normalised_image_data)
+                f.create_dataset('label', data=label_data)
 
     # --------
     # cwd = '/content/drive/My Drive/TransUNet/Training-Testing'
@@ -94,7 +148,12 @@ def main(args: argparse.Namespace):
     parser = argparse.ArgumentParser()
     parser.add_argument('original_dataset_dir', type=Path,
                         help='The root directory for the downloaded, original dataset')
-    parser.add_argument('-td', '--target_dataset_dir', type=Path, default=Path('../../data'),
+    parser.add_argument('-td', '--target-dataset-dir', type=Path, default=Path('../../data'),
                         help='The directory where the processed dataset should be stored.')
+    parser.add_argument('-lp', '--list-path', type=Path, default=Path('../lists/lists_Synapse/train.txt'),
+                        help='Path to one of the dataset lists that contain the case ids that should be used.')
+    parser.add_argument('--clip', nargs=2, type=float, default=[-125, 275],
+                        help='Two numbers [min max] that represent the interval that should be clipped from the '
+                             'original image data.')
     parsed_args = parser.parse_args()
     main(parsed_args)

From 16f8956f981d67a6e9b80bd22d59b24e42d944ae Mon Sep 17 00:00:00 2001
From: hendraet
Date: Mon, 16 Aug 2021 15:00:54 +0200
Subject: [PATCH 3/7] Adapts scripts for data downloading and preprocessing

---
 datasets/README.md          |  21 ++++-
 datasets/download_data.py   |   3 ---
 datasets/preprocess_data.py | 164 +++++++++++--------------------------
 3 files changed, 66 insertions(+), 122 deletions(-)

diff --git a/datasets/README.md b/datasets/README.md
index c662f8e2..516f4b49 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -1,7 +1,20 @@
 # Data Preparing
-1. Access to the synapse multi-organ dataset:
-   1. Sign up in the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and download the dataset. Convert them to numpy format, clip the images within [-125, 275], normalize each 3D image to [0, 1], and extract 2D slices from 3D volume for training cases while keeping the 3D volume in h5 format for testing cases.
+1. Access the synapse multi-organ dataset by signing up on the [official Synapse website](https://www.synapse.org/#!Synapse:syn3193805/wiki/) and downloading the Abdomen dataset.
+   This can also be done by using the `download_data.py` script in the `datasets` directory:
+   ```
+   python download_data.py [Synapse username] [Synapse password] [SynapseID of the dataset] [directory where the file should be stored]
+   ```
+   Using `--help` also displays the SynapseIds of the Abdomen dataset.
+   (You probably want to download either `Reg-Training-Testing` or `Reg-Training-Training`.)
+2. Preprocess the data:
+   1. Use the `preprocess_data.py` script in the `datasets` directory.
+      It will clip the image data within the range [-125, 275], normalize it to [0, 1], extract 2D slices from the 3D volumes and save them in the appropriate file formats.
+   ```
+   python preprocess_data.py [Location of the unzipped abdomen dataset]
+   ```
+   By default, the data in the target directory won't be overwritten unless the `--overwrite` parameter is passed.
+   For an overview of additional arguments use the `--help` option.
 2. You can also send an Email directly to jienengchen01 AT gmail.com to request the preprocessed data for reproduction.
 2.
The directory structure of the whole project is as follows: @@ -9,7 +22,9 @@ . ├── TransUNet │   ├──datasets -│   │    └── dataset_*.py +│   │    ├── dataset_*.py +│   │    ├── download_data.py +│   │    └── preprocess_data.py │   ├──train.py │   ├──test.py │   └──... diff --git a/datasets/download_data.py b/datasets/download_data.py index 240384b3..895bb32c 100644 --- a/datasets/download_data.py +++ b/datasets/download_data.py @@ -9,9 +9,6 @@ def main(args: argparse.Namespace): entity = syn.get(entity=args.entity, downloadLocation=args.download_dir) print("File downloaded successfully") - # filepath = entity.path - # print(f"File was saved to {filepath}") - if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py index 05413d63..1db74ccc 100644 --- a/datasets/preprocess_data.py +++ b/datasets/preprocess_data.py @@ -1,159 +1,91 @@ +import argparse from pathlib import Path +from typing import List -import argparse import h5py import nibabel import numpy +from tqdm import tqdm -def id_to_color(id: float) -> numpy.ndarray: - class_to_color_map = { - "background": "#000000", - "dimgray": "#696969", - "lightgray": "#d3d3d3", - "forestgreen": "#228b22", - "darkred": "#8b0000", - "olive": "#808000", - "lightseagreen": "#20b2aa", - "darkblue": "#00008b", - "red": "#ff0000", - "darkorange": "#ff8c00", - "yellow": "#ffff00", - "lime": "#00ff00", - "royalblue": "#4169e1", - "deepskyblue": "#00bfff", - "blue": "#0000ff", - "fuchsia": "#ff00ff", - "palevioletred": "#db7093", - "khaki": "#f0e68c", - "deeppink": "#ff1493", - "lightsalmon": "#ffa07a", - "violet": "#ee82ee", - } - from PIL import ImageColor - return numpy.asarray(ImageColor.getrgb(list(class_to_color_map.values())[int(id)])) - - -# TODO: how to download testdata? # TODO: specify in README which exact dataset has to be downloaded -def get_case_ids_from_list(dataset_list_path: Path): +def get_case_ids_from_list(dataset_list_path: Path) -> List[str]: with open(dataset_list_path, "r") as f: slices = f.readlines() case_ids = sorted(list(set([s.split("_")[0][4:] for s in slices]))) return case_ids +def get_case_ids_from_directory(directory: Path) -> List[str]: + return [f.stem for f in directory.iterdir()] + + def main(args: argparse.Namespace): - image_dir = args.original_dataset_dir / 'img' - case_ids = get_case_ids_from_list(args.list_path) - case_ids = ["0001"] # TODO - for case_id in case_ids: + image_dir = args.original_dataset_dir / "img" + if args.from_list_file is not None: + case_ids = get_case_ids_from_list(args.from_list_file) + else: + case_ids = get_case_ids_from_directory(image_dir) + print(f"Processing case ids: {case_ids}") + + for case_id in tqdm(case_ids): case_image_dir = image_dir / case_id if not case_image_dir.exists(): print(f"Sub-directory {case_image_dir} doesn't seem to exist. 
Skipping") continue - for image_path in case_image_dir.iterdir(): + for image_path in tqdm(case_image_dir.iterdir(), desc="Processing case files", leave=False): label_id = f"label{image_path.name[3:]}" # cuts "img" from the image filename and replaces it with "label" label_path = args.original_dataset_dir / "label" / case_id / label_id - assert image_path.exists() and label_path.exists(), f'For id {case_id} either the image or label file ' \ - f'is missing' + assert image_path.exists() and label_path.exists(), f"For id {case_id} either the image or label file " \ + f"is missing" image_data = nibabel.load(image_path).get_fdata() label_data = nibabel.load(label_path).get_fdata() clipped_image_data = numpy.clip(image_data, *args.clip) normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0]) - # Reorders data so that the channel dimension is at the front for easier indexing later + # Reorders data so that the channel dimension is at the front for easier iteration in the subsequent + # for-loop transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1)) transposed_label_data = numpy.transpose(label_data, (2, 0, 1)) # Extracting slices for training - for i, (image_slice, label_slice) in enumerate(zip(transposed_image_data, transposed_label_data)): - out_filename = args.target_dataset_dir / f'Synapse/train_npz/case{case_id}_slice{i:03d}.npz' - - # TODO: remove - tmp_image = numpy.repeat(numpy.expand_dims(image_slice, axis=2), 3, axis=2) - tmp_label = numpy.zeros((*label_slice.shape, 3), dtype=numpy.uint8) - # for id in numpy.unique(label_data): - # if id == 0.0: - # continue - # mask = numpy.where(label_slice == id) - # tmp_label[mask] = id_to_color(id) - combined_array = numpy.concatenate(((tmp_image * 255).astype(numpy.uint8), tmp_label), axis=1) - from PIL import Image - Image.fromarray(combined_array, mode="RGB").show() + for i, (image_slice, label_slice) in tqdm(enumerate(zip(transposed_image_data, transposed_label_data)), + desc="Processing slices", leave=False): + out_filename = args.target_dataset_dir / f"Synapse/train_npz/case{case_id}_slice{i:03d}.npz" + if not args.overwrite and out_filename.exists(): # Do not overwrite data unless flag is set + continue if not out_filename.parent.exists(): out_filename.parent.mkdir(exist_ok=True, parents=True) numpy.savez(out_filename, image=image_slice, label=label_slice) # keep the 3D volume in h5 format for testing cases. - # TODO: check if this is correct or if the testdata should be downloaded separately - h5_filename = args.target_dataset_dir / f'Synapse/test_vol_h5/case{case_id}.npy.h5' + h5_filename = args.target_dataset_dir / f"Synapse/test_vol_h5/case{case_id}.npy.h5" + if not args.overwrite and h5_filename.exists(): # Do not overwrite data unless flag is set + continue if not h5_filename.parent.exists(): h5_filename.parent.mkdir(exist_ok=True, parents=True) - with h5py.File(h5_filename, 'w') as f: - f.create_dataset('image', data=normalised_image_data) - f.create_dataset('label', data=label_data) - - # -------- - # cwd = '/content/drive/My Drive/TransUNet/Training-Testing' - # data_folder = '/content/drive/My Drive/TransUNet/data' - # subfolders = os.listdir(cwd + '/' + 'img') # subfolders will be like ['0062', '0064', ...] 
- # - # # I chose subfolder '0066', but maybe you will want to iterate & combine - # for subfolder in ['0066']: # subfolders[1:]: - # print(subfolder) - # tempwd = cwd + '/' + 'img' + '/' + subfolder - # files = os.listdir(tempwd) # files will be like ['img0032-0066.nii.gz', 'img0036-0066.nii.gz', ...] - # - # # iterate over filenames - # for filename in files: - # print(filename) - # righttext = filename[3:] # get the part 'xxxx-xxxx.nii.gz' - # subject = righttext[:4] - # img = nib.load(cwd + '/' + 'img' + '/' + subfolder + '/' + 'img' + righttext) - # label_data = nib.load(cwd + '/' + 'label' + '/' + subfolder + '/' + 'label' + righttext) - # - # # Convert them to numpy format, - # data = img.get_fdata() - # label_data = label_data.get_fdata() - # - # # clip the images within [-125, 275], - # data_clipped = np.clip(data, -125, 275) - # - # # normalize each 3D image to [0, 1], and - # data_normalised = (data_clipped - (-125)) / (275 - (-125)) - # - # # extract 2D slices from 3D volume for training cases while - # # e.g. slice 000 - # for i in range(data_normalised.shape[2]): - # formattedi = '{:03d}'.format(i) - # slice000 = data_normalised[:, :, i] - # label_slice000 = label_data[:, :, i] - # np.savez(data_folder + '/Synapse/train_npz/case' + subject + '_slice' + formattedi + '.npz', - # image=slice000, - # label=label_slice000) - # - # # keep the 3D volume in h5 format for testing cases. - # fn = data_folder + '/Synapse/test_vol_h5/case' + subject + '.npy.h5' - # f = h5py.File(fn, 'w') - # dset = f.create_dataset('image', data=data_normalised) - # dset = f.create_dataset('label', data=label_data) - # f.close() - - -if __name__ == '__main__': + with h5py.File(h5_filename, "w") as f: + f.create_dataset("image", data=normalised_image_data) + f.create_dataset("label", data=label_data) + + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('original_dataset_dir', type=Path, - help='The root directory for the downloaded, original dataset') - parser.add_argument('-td', '--target-dataset-dir', type=Path, default=Path('../../data'), - help='The directory where the processed dataset should be stored.') - parser.add_argument('-lp', '--list-path', type=Path, default=Path('../lists/lists_Synapse/train.txt'), - help='Path to one of the dataset lists that contain the case ids that should be used.') - parser.add_argument('--clip', nargs=2, type=float, default=[-125, 275], - help='Two numbers [min max] that represent the interval that should be clipped from the ' - 'original image data.') + parser.add_argument("original_dataset_dir", type=Path, + help="The root directory for the downloaded, original dataset") + parser.add_argument("-td", "--target-dataset-dir", type=Path, default=Path("../../data"), + help="The directory where the processed dataset should be stored.") + parser.add_argument("-fl", "--from-list-file", type=Path, + help="Do not process all directories that are contained in the original dataset directory, " + "but use those contained in the passed list file. 
The data in the list must be "
+                             "structured as in the train.txt file located in lists/lists_Synapse.")
+    parser.add_argument("--clip", nargs=2, type=float, default=[-125, 275],
+                        help="Two numbers [min max] that represent the interval that should be clipped from the "
+                             "original image data.")
+    parser.add_argument("--overwrite", action="store_true", default=False,
+                        help="Overwrite the data present in the target dataset directory")
     parsed_args = parser.parse_args()
     main(parsed_args)

From 19b7a6110890a7f9e661b1790d4d18cb190d8ee5 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Wed, 18 Aug 2021 09:48:02 +0200
Subject: [PATCH 4/7] Minor fix

---
 datasets/preprocess_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
index 1db74ccc..ddc000d2 100644
--- a/datasets/preprocess_data.py
+++ b/datasets/preprocess_data.py
@@ -12,7 +12,7 @@
 def get_case_ids_from_list(dataset_list_path: Path) -> List[str]:
     with open(dataset_list_path, "r") as f:
         slices = f.readlines()
-    case_ids = sorted(list(set([s.split("_")[0][4:] for s in slices])))
+    case_ids = sorted(list(set([s.split("_")[0][4:].rstrip() for s in slices])))
     return case_ids
 
 

From 9556e79ab8674e95935dc8b907ae839236323bc3 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Thu, 19 Aug 2021 15:36:12 +0200
Subject: [PATCH 5/7] Adds synapseclient dependency to requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 49b8739f..a9e046ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ SimpleITK
 scipy
 h5py
 nibabel
+synapseclient

From a58563efa7c99ad84172f1cc668552d483a8b8b1 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Mon, 30 Aug 2021 15:02:49 +0200
Subject: [PATCH 6/7] Removes TODO

---
 datasets/preprocess_data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
index ddc000d2..a6c5e645 100644
--- a/datasets/preprocess_data.py
+++ b/datasets/preprocess_data.py
@@ -8,7 +8,6 @@
 from tqdm import tqdm
 
 
-# TODO: specify in README which exact dataset has to be downloaded
 def get_case_ids_from_list(dataset_list_path: Path) -> List[str]:
     with open(dataset_list_path, "r") as f:
         slices = f.readlines()

From 3fb1c04881bf9862a9e730ec61622596b5ce1395 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Fri, 3 Sep 2021 13:23:10 +0200
Subject: [PATCH 7/7] Fixes error in test data preprocessing

---
 datasets/preprocess_data.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/datasets/preprocess_data.py b/datasets/preprocess_data.py
index a6c5e645..ed03b6d9 100644
--- a/datasets/preprocess_data.py
+++ b/datasets/preprocess_data.py
@@ -44,8 +44,8 @@ def main(args: argparse.Namespace):
             clipped_image_data = numpy.clip(image_data, *args.clip)
             normalised_image_data = (clipped_image_data - args.clip[0]) / (args.clip[1] - args.clip[0])
 
-            # Reorders data so that the channel dimension is at the front for easier iteration in the subsequent
-            # for-loop
+            # Reorders data so that the channel/slice dimension is in front. Makes iteration easier when processing
+            # slices and when using the h5 file for testing
             transposed_image_data = numpy.transpose(normalised_image_data, (2, 0, 1))
             transposed_label_data = numpy.transpose(label_data, (2, 0, 1))
 
@@ -53,9 +53,9 @@ def main(args: argparse.Namespace):
             for i, (image_slice, label_slice) in tqdm(enumerate(zip(transposed_image_data, transposed_label_data)),
                                                       desc="Processing slices", leave=False):
                 out_filename = args.target_dataset_dir / f"Synapse/train_npz/case{case_id}_slice{i:03d}.npz"
+
                 if not args.overwrite and out_filename.exists():  # Do not overwrite data unless flag is set
                     continue
-
                 if not out_filename.parent.exists():
                     out_filename.parent.mkdir(exist_ok=True, parents=True)
                 numpy.savez(out_filename, image=image_slice, label=label_slice)
@@ -67,8 +67,8 @@ def main(args: argparse.Namespace):
             if not h5_filename.parent.exists():
                 h5_filename.parent.mkdir(exist_ok=True, parents=True)
             with h5py.File(h5_filename, "w") as f:
-                f.create_dataset("image", data=normalised_image_data)
-                f.create_dataset("label", data=label_data)
+                f.create_dataset("image", data=transposed_image_data)
+                f.create_dataset("label", data=transposed_label_data)
 
 
 if __name__ == "__main__":
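A minimal sketch, not part of the patch series above, of how the files written by `preprocess_data.py` could be sanity-checked after applying these patches; the case and slice numbers are hypothetical examples, and the data root assumes the script's default `--target-dataset-dir` of `../../data`:

```python
from pathlib import Path

import h5py
import numpy

# Default --target-dataset-dir used by preprocess_data.py; adjust if a different directory was passed.
data_root = Path("../../data/Synapse")

# Inspect one training slice (hypothetical case/slice id): each .npz holds an 'image' and a 'label' array.
slice_file = data_root / "train_npz" / "case0001_slice000.npz"
with numpy.load(slice_file) as npz:
    image, label = npz["image"], npz["label"]
    print(slice_file.name, image.shape, float(image.min()), float(image.max()), numpy.unique(label))

# Inspect one test volume: after the last patch both datasets are stored slice-first (slice, height, width).
volume_file = data_root / "test_vol_h5" / "case0001.npy.h5"
with h5py.File(volume_file, "r") as h5:
    print(volume_file.name, h5["image"].shape, h5["label"].shape)
```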