diff --git a/recognition/47049358/.gitignore b/recognition/47049358/.gitignore new file mode 100644 index 000000000..410d378be --- /dev/null +++ b/recognition/47049358/.gitignore @@ -0,0 +1,20 @@ +semantic_labels_anon +semantic_MRs_anon + +**__pycache__** + +*.ipynb + +*.pkl + +Images + +rangpur_outputs + +*.sh + +*.out + +*.pdf + +BriefDataDescription.txt \ No newline at end of file diff --git a/recognition/47049358/README.md b/recognition/47049358/README.md new file mode 100644 index 000000000..83ac23de6 --- /dev/null +++ b/recognition/47049358/README.md @@ -0,0 +1,201 @@ +--- +title: COMP3710 Report +author: "Ryuto Hisamoto" +date: "2024-10-25" +--- + +# Table of Contents + +- [Table of Contents](#table-of-contents) +- [Improved 3D UNet](#improved-3d-unet) + - [Problem](#problem) + - [Model](#model) +- [Loading Data](#loading-data) +- [Training](#training) + - [Loss Function](#loss-function) + - [Optimiser](#optimiser) +- [Testing](#testing) +- [Result](#result) +- [Discussion](#discussion) +- [Conclusion](#conclusion) +- [References](#references) +- [Dependencies](#dependencies) + +# Improved 3D UNet + +Improved 3D UNet is capable of producing segmentations for medical images. The report covers the architecture of model,its parameters and relevant components, and its performance on 3D prostate data. + +## Problem + +Segmentation is a task that requires a machine learning models to divide image components into meaningful parts. +In other words, the model is required to classify components of an image correctly into corresponding labels. + +## Model + +[`modules.py`](modules.py) + +
+
+
+ Figure 1: Improved 3D UNet Architecture
+
+ +UNet is an architecture for convolutional neural networks specifically for segmentation tasks (Gupta, 2021). +The model takes advantage of skip connections and tensor concatenations to preserve input details and its structure while learning appropriate segmentations. +The basic structure of UNet involves the downsampling and upsampling of original images with skip connections in between corresponding pair of downsampling and upsampling layers. +Skip connection is a technique used to (1) preserve features of the image and (2) prevent diminishing gradients over deep layers of network preventing the learning of parameters (PATHAK, 2024). +The authors of "Brain Tumor Segmentation and Radiomics Survival Prediction: Contribution to the BRATS 2017 Challenge" proposes the improvement on the architecture by the integration of segmentation layers at different levels. The resulting architecture is improved 3D UNet which is capable of performing complex segmentation tasks with appropriate parameters and components. + +In the context pathway (encoding part), 3 x 3 x 3 convolution with a stride and padding of 1 is applied in each convolutional layer. Then, instance normalisation is applied and its output groes through leaky ReLU with a negative slope of $10 ^ {-2}$ as an activation function. We refer to this module as a 'standard module', and 3 x 3 x 3 stride 2 convolution is the same as a standard module except its stride being 2 to reduce the resolution of input.Context modules are composed of two standard modules with a drop out layer in-between with a dropout probability of 30%. This helps in reducing computational cost and memory requirements. Lastly, output from context modules are combined with its input passed from a standard module with element-wise sum. From the 2nd level, the depth of layers is doubled, and the process is repeated throughout each level of the context pathway. 
+ +The localisation pathway (decoding part) utilises a 4 x 4 x 4 transposed convolution with a stride of 2 and padding of 1 to increase the resolution while reducing the feature maps. As the input goes up layers, they are concatenated with the output from a context module on the same layer to preserve features which are potentially lost as they go through the network. Then, localisation modules combines the features together while reducing the number of feature maps to reduce memory consumption. Its output is handed over to the following upsampling module, and the process is repeated until it reaches back to the original level of the architecture. When the input reaches to the original level, it goes through another standard module before handed over to a segmentation layer and is summed with the previsous outputs of segmentation layers. + +From the third localisation layer, segmentation layers which apply 1 x 1 x 1 convolution with a stride of 1 take outputs from localisation modules and map them to the corresponding segmentation labels and are summed element-wise after upscaled to match the size. Finally the output is applied a softmax to turn its predictions of labels into probabilities for later calculation of loss. It is to be noted that, argmax has to be applied to produce proper masks from the output of the model. Otherwise, the model produces its predictions from the architecture and processed discussed. + +# Loading Data + +[`dataset.py`](dataset.py) + +The authors of "Brain Tumor Segmentation and Radiomics Survival Prediction: Contribution to the BRATS 2017 Challenge" seem to have used the following augmentation methods: + +- Random rotation +- Random scaling +- Elastic transformation +- Gamma correction +- Mirroring (assumably horizontal flip given their problem space) + +However, some augmentations methods are altered to limit the complexity of solution. 
For instance, use of elastic transformation was avoided as it could alter the image significantly, causing it to deviate from the actual images the model may find. Moreover, the tuning of such complex method could decrease the maintainability of solutin. Therefore, the project preserved basic augmentation techniques to process the training data. More precisely, techniques used are limited to: + +- Random Rotation ($[-0.5, 0.5]$ for all x, y, and z coordinates) +- Random Vertical Flip +- Gaussian Noise ($\mu = 0, \sigma = 0.5$) +- Resizing (down to (128 x 128 x 64)) + +Resizing is an optional transformation as it is meant to be done to save the memory consumption and increase the speed of training. **However, resizing must be applied with limited memory resources to run the model**. The past attempts have shown that the implementation cannot process image size of (256 x 256 x 128) regardless of batch size. In addition, all images are normalised as they are loaded to eliminate difference in intensity scales if there are any. Finally, all voxel values are loaded as `torch.float32` but `torch.uint8` is used for labels to save memory consumption. The labels in the dataset are indexed according to the table below, and are assigned to corresponding layers as binary masks as they are on-hot encoded. For example, segments corresponding to the first type of label appears as 1s in the 0th layer while other parts appear as 0s, and so on. + + |Labels|Segment| +| - | - | +|0| Background | +|1| Body | +|2| Bones | +|3| Bladder | +|4| Rectum | +|5| Prostate | + + +
+
+
+ Figure 2: Example of labels layered on top of images.
+
+ +# Training + +[`train.py`](train.py) + +- Batch Size: 2 +- Number of Epochs: 300 +- Learning Rate: $5e ^ {-4}$ +- Initial Learning Rate (for lr_scheduler): 0.985 +- Weight Decay: $1e ^ {-5}$ + +The model takes in an raw image as its input, and its goal is to learn the best feature map which ends up being a multi-channel segmentation of the original image. + +## Loss Function + +The model utilises dice loss as its loss function. Moreover, it is capable of using deviations of dice loss such as a sum of dice loss and cross-entropy loss, or focal loss. A vanilla dice score has formula: $$D(y_{true}, y_{pred}) = 2 \times \frac{\Sigma(y_{true} \cdot y_{pred})}{\Sigma y_{true} + \Sigma y_{pred}}$$ + +in which $y_{true}$ is the ground truth probability and $y_{pred}$ is the predicted probability. Hence dice loss is provided by: + +$$L_{Dice} = 1 - D(y_{true}, y_{pred})$$ + +The loss function mitigates the problem with other loss functions such as a cross-entropy loss which tend to be biased toward a dominant class. The design of dice loss provides more accurate representation of the model's performance in segmentation. In addition `monai` provides an option to exclude background from the calculation of loss, and the model makes use of this option when calculating the loss (background is included when testing). + +It is recommended to use the sum of dice loss and a weighted cross-entropy loss (Yang et al., 2022) for the problem as it seems to optimise the performance the most. Cross-entropy loss is calculating by: + +$$L_{CE} = \frac{1}{N} \Sigma_i - [y_i \times \ln (p_i) + (1 - y_i) \times \ln (1 - p_i)]$$ + +where $y_i$ is the lebel of sample $i$ and $p_i$ represents the probability of sample $i$ predicted to be positive, and $N$ represents the number of samples. 
Hence the its wegithed sum with a dice loss can be shown as + +$$L_{loss} = L_{Dice} + \alpha L_{CE}$$ + +"Multi-task thyroid tumor segmentation based on the joint loss function" recommends to set $\alpha = 0.2$, so the report strictly follows it to calculate the weighted loss. + +## Optimiser + +**Adam (Adaptive Moment Estimation)** is an optimisation algorithm that boosts the speed of convergence of gradient descent. The optimiser utilises an exponential average of gradients, which allows its efficient and fast pace of convergence. Moreover, the optimiser applies a **$L_2$ regularisation** (aka Tikhonov regularisation) to penalise for the complexity of model. Complexity can be defined as the number of parameters learned from the data, and high complexity is likely to be an indication of overfitting to the training samples. Hence, regularisation is necessary to prevent the model from learning high values of parameters by penalising the model for its complexity, and $L_2$ regularisation is one of the explicit regularisation methods which adds an extra penalty term to the cost function. The parameters learned with such technique can be denoted as + +$$\hat{\theta} = \arg \min_\theta \frac{1}{n} ||X\theta - y||^ 2_ 2 + \lambda ||\theta|| ^ 2 _ 2$$ + +In addition, the model utilises a learning rate scheduler based on the number of epochs, which dynamically changes the learning rate over epochs. This allows the model to start from a large learning rate which evntually settles to a small learning rate for easier convergence. In the implementation, the learnign rate is reduced by $1e ^ {-5}$ over each epoch. + +It is to be noted that mixed precision and gradient accumulation are used to reduce the memory consumption during the training. 
**Mixed precision** reduces the memory consumption by replaceing value types with `torch.float16` where it can to reduce the space required to perform necessary operations including loss and gradient calculations necessary to train the model. **Gradient accumulation** accumulates the gradients and updates the weights after some training loop. + +# Testing + +[`predict.py`](predict.py) + +The model is tested by measuring its dice scores on the segmentations it produces for unseen images. Although the model outputs softmax values for its predicted segmentations, they are one-hot encoded during the test to maximise the contribution of correct predictions. Dice scores for each label is calculated independently to obtain the accurate performance to analyse the model's weakness and strengths in predicting particular segments for all labels. Then, their averages are taken and are summarised in the bar chart. Moreover, the visualisation of first 9 labels are produced with the actual segmentations for comparison. + +# Result + +
+
+
+ Figure 3: The Training Progress with Dice Loss + 0.2CE
+
+ +
+
+
+ Figure 4: Example of Ground Truth Labels used for Testing
+
+ +
+
+
+ Figure 5: Example of Predicted Labels produced by the model
+
+ +
+
+
+ Figure 6: The Final Dice Scores achieved by the Model for Each Label
+
+
+The outcome shows the significant impact of the choice of loss function on the performance of the model. It was found that with other loss functions, the model performs poorly on assigning correct labels to small segments. Specifically, segment label 4 (rectum) often suffered from poor performance as it was often ignored by the model in optimising the segmentation of the corresponding label. However, the addition of a weighted cross-entropy loss seems to force the model to classify segments correctly, which appears to cause a tremendous improvement in performance. The final model produces segment predictions with dice scores greater than 0.8 each, which is an astonishing performance from where it started off.
+
+# Discussion
+
+Firstly, there had to be a compromise in maintaining the original resolution of the image given the limitation in resources. The model seems to perform well on downsized images, but without testing it on images with the original resolution, its performance on original images can only be estimated. Moreover, the optimality of the architecture remains a question, as the model could potentially be simplified to perform the same task without facing issues in its large consumption of computer memory.
+
+Secondly, the project did not incorporate the idea of patient-level predictions. Despite the model's strong performance, its true robustness to scans taken from new patients must be explored to test its true ability to produce segmentations. In the future, the model has to be tested for its capability by training it based on patient-level images.
+
+Finally, although the report strictly followed the implementation of the architectures and loss functions from the published papers with different problem space, there could be more optimal or efficient adjustments that could improve the model's performance in terms of accuracy and time and/or memory savings. Therefore, future research
+could focus on improving the current model by differentiating it from the architectures and components already described by researchers, enabling new discoveries.
+
+# Conclusion
+
+Improved 3D UNet is a powerful architecture which makes complex image-processing tasks possible. However, its performance is truly maximised through the observation of its behaviour and performance under different settings, tunings, and/or parameter selections. In the given problem of segmenting 3D prostate images, adjusting the loss function from a vanilla dice loss to the sum of dice loss and weighted cross-entropy loss improved the performance dramatically. The model could be explored in depth with regard to its relationship with its components for improved performance, which could potentially lead to a discovery of new and more generalised architectures that could function in wider contexts.
+
+# References
+
+1. Gupta, P. (2021, December 17). Understanding Skip Connections in Convolutional Neural Networks using U-Net Architecture. Medium. https://medium.com/@preeti.gupta02.pg/understanding-skip-connections-in-convolutional-neural-networks-using-u-net-architecture-b31d90f9670a
+
+2. Isensee, F., Kickingereder, P., Wick, W., Bendszus, M., & Maier-Hein, K. (2018). Brain Tumor Segmentation and Radiomics Survival Prediction: Contribution to the BRATS 2017 Challenge. In arXiv. https://arxiv.org/pdf/1802.10508v1
+
+3. PATHAK, H. (2024, July 21). How do skip connections impact the training process of neural networks? Medium. https://medium.com/@harshnpathak/how-do-skip-connections-impact-the-training-process-of-neural-networks-bccca6efb2eb
+
+4. Yang, D., Li, Y., & Yu, J. (2022). Multi-task thyroid tumor segmentation based on the joint loss function. Biomedical Signal Processing and Control, 79(2). https://doi.org/10.1016/j.bspc.2022.104249
+
+# Dependencies
+
+- matplotlib=3.9.2
+- monai=1.4.0
+- nibabel=5.3.2=pypi_0
+- pytorch=2.5.0
+- scikit-learn=1.5.2=pypi_0
+- torchaudio=2.5.0
+- torchvision=0.20.0
+
+_\*For more details, please refer to the [`requirements.txt`](requirements.txt)._
\ No newline at end of file
diff --git a/recognition/47049358/dataset.py b/recognition/47049358/dataset.py
new file mode 100644
index 000000000..7a0ae82d9
--- /dev/null
+++ b/recognition/47049358/dataset.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+""" Initialises monai transformations and loads paths to image and label nifti files.
+
+dataset.py loads the files to perform image segmentation. It loads the paths to images and corresponding
+labels, but none of them are processed in the file. Moreover, transformation on training set and test set
+are defined in the file, but they are to be exported with corresponding dictionary files and used with
+monai.data.Dataset and Dataloader.
+
+"""
+
+# ==========================
+# Imports
+# ==========================
+import os
+from sklearn.model_selection import train_test_split
+from monai.transforms import (LoadImaged, EnsureChannelFirstd, NormalizeIntensityd,
+ SpatialCropd, RandFlipd, RandRotated, AsDiscreted,
+ RandGaussianNoised, Compose, CastToTyped, Resized)
+import torch
+
+__author__ = "Ryuto Hisamoto"
+
+__license__ = "Apache"
+__version__ = "1.0.0"
+__maintainer__ = "Ryuto Hisamoto"
+__email__ = "s4704935@student.uq.edu.au"
+__status__ = "Committed"
+
+# ==========================
+# Constants
+# ==========================
+
+IMAGE_FILE_NAME = '/home/groups/comp3710/HipMRI_Study_open/semantic_MRs' # on rangpur
+LABEL_FILE_NAME = '/home/groups/comp3710/HipMRI_Study_open/semantic_labels_only' # on rangpur
+
+# IMAGE_FILE_NAME = os.path.join(os.getcwd(), 'semantic_MRs_anon') # assuming folders are in the cwd
+# LABEL_FILE_NAME = os.path.join(os.getcwd(), 'semantic_labels_anon')
+
+rawImageNames = sorted(os.listdir(IMAGE_FILE_NAME))
+rawLabelNames = sorted(os.listdir(LABEL_FILE_NAME))
+
+# Split the set into train, validation, and test set (80 : 20 for train:test)
+train_images, test_images, train_labels, test_labels = train_test_split(rawImageNames, rawLabelNames, train_size=0.8) # Split the data in training and test set
+
+"""
+A transformation is performed for consistent dimensions across all images and labels, and random augmentation
+of files to prevent the model's overfitting to the training set. They are performed in the order of: loading, cropping (to remove extra dimensions),
+normalisation of voxel values, random vertical flip (spatial_axis = 2), random rotation (of small degrees), and
+addition of random noise. For labels, an extra step to change encodings is applied.
+"""
+
+train_transforms = Compose(
+ [
+ LoadImaged(keys=["image", "label"]),
+ EnsureChannelFirstd(keys=["image", "label"]),
+ SpatialCropd(keys=["image", "label"], roi_slices=[slice(None), slice(None), slice(0, 128)]), # Crop to depth 128
+ NormalizeIntensityd(keys=["image"]),
+ Resized(keys=["image", "label"], spatial_size=(128, 128, 64)),
+ RandFlipd(keys=["image", "label"], spatial_axis=2, prob=0.5),
+ RandRotated(keys=["image", "label"], range_x=0.5, range_y=0.5, range_z=0.5, mode='nearest', prob=0.5),
+ RandGaussianNoised(keys=["image"], prob=0.5, mean=0, std=0.5),
+ AsDiscreted(keys=["label"], to_onehot=6),
+ CastToTyped(keys=["label"], dtype=torch.uint8),
+ ]
+)
+
+"""
+A transformation on the test set involves the loading of images and labels, cropping for consistent dimensions,
+normalisation of voxel values and encoding of labels.
+"""
+
+test_transforms = Compose(
+ [
+ LoadImaged(keys=["image", "label"]),
+ EnsureChannelFirstd(keys=["image", "label"]),
+ SpatialCropd(keys=["image", "label"], roi_slices=[slice(None), slice(None), slice(0, 128)]),
+ NormalizeIntensityd(keys=["image"]),
+ Resized(keys=["image", "label"], spatial_size=(128, 128, 64)),
+ AsDiscreted(keys=["label"], to_onehot=6),
+ CastToTyped(keys=["label"], dtype=torch.uint8),
+ ]
+)
+
+# Loads paths to images and labels, but does not process them yet
+
+train_dict = [{"image": os.path.join(IMAGE_FILE_NAME, image), "label": os.path.join(LABEL_FILE_NAME, label)}
+ for image, label in zip(train_images, train_labels)]
+test_dict = [{"image": os.path.join(IMAGE_FILE_NAME, image), "label": os.path.join(LABEL_FILE_NAME, label)}
+ for image, label in zip(test_images, test_labels)]
\ No newline at end of file
diff --git a/recognition/47049358/documentation/dice_coefs_test_dice_ce_loss.png b/recognition/47049358/documentation/dice_coefs_test_dice_ce_loss.png
new file mode 100644
index 000000000..98c4671d5
Binary files /dev/null and b/recognition/47049358/documentation/dice_coefs_test_dice_ce_loss.png differ
diff --git a/recognition/47049358/documentation/example_labels_and_images.png b/recognition/47049358/documentation/example_labels_and_images.png
new file mode 100644
index 000000000..7fc154c95
Binary files /dev/null and b/recognition/47049358/documentation/example_labels_and_images.png differ
diff --git a/recognition/47049358/documentation/ground_truths_dice_ce_loss.png b/recognition/47049358/documentation/ground_truths_dice_ce_loss.png
new file mode 100644
index 000000000..d73fea031
Binary files /dev/null and b/recognition/47049358/documentation/ground_truths_dice_ce_loss.png differ
diff --git a/recognition/47049358/documentation/model_architecture.png b/recognition/47049358/documentation/model_architecture.png
new file mode 100644
index 000000000..df02206be
Binary files /dev/null and b/recognition/47049358/documentation/model_architecture.png differ
diff --git a/recognition/47049358/documentation/predictions_dice_ce_loss.png b/recognition/47049358/documentation/predictions_dice_ce_loss.png
new file mode 100644
index 000000000..437e3a75d
Binary files /dev/null and b/recognition/47049358/documentation/predictions_dice_ce_loss.png differ
diff --git a/recognition/47049358/documentation/unet_dice_coefs_over_epochs_dice_ce_loss.png b/recognition/47049358/documentation/unet_dice_coefs_over_epochs_dice_ce_loss.png
new file mode 100644
index 000000000..aec1db5a7
Binary files /dev/null and b/recognition/47049358/documentation/unet_dice_coefs_over_epochs_dice_ce_loss.png differ
diff --git a/recognition/47049358/modules.py b/recognition/47049358/modules.py
new file mode 100644
index 000000000..0edacf759
--- /dev/null
+++ b/recognition/47049358/modules.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python
+"""
+The model and its building blocks of 3d Improved UNet
+"""
+import torch
+import torch.nn as nn
+
+__author__ = "Ryuto Hisamoto"
+
+__license__ = "Apache"
+__version__ = "1.0.0"
+__maintainer__ = "Ryuto Hisamoto"
+__email__ = "s4704935@student.uq.edu.au"
+__status__ = "Committed"
+
+NEGATIVE_SLOPE = 10 ** -2
+DROP_PROB = 0.3
+NUM_SEGMENTS = 6
+
+""" The most standard module which contains the 3 x 3 x 3 convolutional operation as with the normalisation
+ of the values and activations with Leaky ReLU. Instance normalisation is affine-enabled.
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+ - kernel_size (int, optional): Size of the convolutional kernel (default is 3).
+ - stride (int, optional): Stride of the convolution operation (default is 1).
+ - padding (int, optional): Padding size for the convolution (default is 1).
+ - inplace (bool, optional): Whether to perform operations in place.
+"""
+class StandardModule(nn.Module):
+ def __init__(self, in_channels, out_channels,
+ kernel_size = 3, stride = 1, padding = 1, inplace = False):
+ super(StandardModule, self).__init__()
+ self.conv = nn.Conv3d(in_channels = in_channels, out_channels = out_channels,
+ kernel_size = kernel_size, stride = stride, padding = padding)
+ self.instance_norm = nn.InstanceNorm3d(out_channels, affine=True)
+ self.l_relu = nn.LeakyReLU(negative_slope=NEGATIVE_SLOPE, inplace = inplace)
+
+ def forward(self, x):
+ x = self.conv(x)
+ x = self.instance_norm(x)
+ x = self.l_relu(x)
+ return x
+
+""" Context module which functions as a pre-activation residual block with 2 StandardModules
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+"""
+class ContextModule(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super(ContextModule, self).__init__()
+ self.block1 = StandardModule(in_channels = in_channels, out_channels = out_channels, inplace = True)
+ self.dropout = nn.Dropout(DROP_PROB)
+ self.block2 = StandardModule(in_channels = in_channels, out_channels = out_channels, inplace = True)
+
+ def forward(self, x):
+ x = self.block1(x)
+ x = self.dropout(x)
+ x = self.block2(x)
+ return x
+
+""" A module that applies 3 x 3 x 3 convolution and following operations excpet with the stride of 2. All encoding layers
+utilise this after the first one.
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+ - kernel_size (int, optional): Size of the convolutional kernel (default is 3).
+ - stride (int, optional): Stride of the convolution operation (default is 2).
+ - padding (int, optional): Padding size for the convolution (default is 1).
+ - inplace (bool, optional): Whether to perform operations in place.
+"""
+class Stride2Module(nn.Module):
+ def __init__(self, in_channels, out_channels,
+ kernel_size=3, stride=2, padding=1, inplace = False):
+ super(Stride2Module, self).__init__()
+ self.conv = nn.Conv3d(in_channels = in_channels, out_channels = out_channels,
+ kernel_size = kernel_size, stride = stride, padding = padding)
+ self.instance_norm = nn.InstanceNorm3d(out_channels, affine=True)
+ self.l_relu = nn.LeakyReLU(negative_slope=NEGATIVE_SLOPE, inplace = inplace)
+
+ def forward(self, x):
+ x = self.conv(x)
+ x = self.instance_norm(x)
+ x = self.l_relu(x)
+ return x
+
+""" A module that upsamples (decodes) from the bottom-most layer using a convolutional transpose.
+The module is used throughout the localisation pathway to take features from lower levels of the network that encode
+contextual information at low spatial resolution and transfer that information to a higher spatial resolution.
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+ - kernel_size (int, optional): Size of the convolutional kernel (default is 4).
+ - stride (int, optional): Stride of the convolution operation (default is 2).
+ - padding (int, optional): Padding size for the convolution (default is 1).
+ - inplace (bool, optional): Whether to perform operations in place.
+"""
+class UpsamplingModule(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super(UpsamplingModule, self).__init__()
+ self.conv_transpose = nn.ConvTranspose3d(in_channels = in_channels, out_channels = out_channels,
+ kernel_size = 4, stride = 2, padding = 1)
+ self.block = StandardModule(in_channels = out_channels, out_channels = out_channels, inplace = True)
+
+ def forward(self, x):
+ x = self.conv_transpose(x)
+ x = self.block(x)
+ return x
+
+""" Localisation modules that consists of a 3 x 3 x 3 convolution followed by a 1 x 1 x 1 convolution that halves the
+number of feature maps. It accepts the concatenated features from the skip connection and recombines them together.
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+"""
+class LocalisationModule(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super(LocalisationModule, self).__init__()
+
+ self.block1 = StandardModule(in_channels = in_channels, out_channels = out_channels)
+
+ self.block2 = StandardModule(in_channels = out_channels, out_channels = out_channels, kernel_size = 1, padding = 0)
+
+ def forward(self, x):
+ x = self.block1(x)
+ x = self.block2(x)
+ return x
+
+""" A segmentation layer that is integrated at different levels of the network, which are combined via elementwise summation
+to form the final network output.
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+"""
+class SegmentationLayer(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super(SegmentationLayer, self).__init__()
+ self.seg = nn.Conv3d(in_channels = in_channels, out_channels = out_channels,
+ kernel_size = 1, stride = 1, padding = 0)
+
+ def forward(self, x):
+ return self.seg(x)
+
+""" A module that upscales the input for 2 times. The module is to be used to match the scale of feature maps
+of segmentation layers from different levels of the network.
+
+ Parameters:
+ - in_channels (int): Number of input channels.
+ - out_channels (int): Number of output channels.
+"""
+
+class UpScaleModule(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super(UpScaleModule, self).__init__()
+ self.upscale = nn.ConvTranspose3d(in_channels = in_channels, out_channels = out_channels,
+ kernel_size = 4, stride = 2, padding = 1)
+
+ def forward(self, x):
+ return self.upscale(x)
+
+""" 3D imporoved UNet that produces segmentations by first aggregating high level information by
+context pathway and localising precisely in the localisation pathway.
+"""
+class ImprovedUnet(nn.Module):
+ def __init__(self):
+ super(ImprovedUnet, self).__init__()
+        self.block1 = StandardModule(1, 16) # Grayscale thus requires 1 input channel
+ self.context1 = ContextModule(16, 16)
+
+ self.block2 = Stride2Module(16, 32)
+ self.context2 = ContextModule(32, 32)
+
+ self.block3 = Stride2Module(32, 64)
+ self.context3 = ContextModule(64, 64)
+
+ self.block4 = Stride2Module(64, 128)
+ self.context4 = ContextModule(128, 128)
+
+ self.block5 = Stride2Module(128, 256)
+ self.context5 = ContextModule(256, 256)
+
+ self.upsample1 = UpsamplingModule(256, 128)
+
+ self.localise1 = LocalisationModule(256, 128)
+ self.upsample2 = UpsamplingModule(128, 64)
+
+ self.localise2 = LocalisationModule(128, 64)
+ self.upsample3 = UpsamplingModule(64, 32)
+
+ self.localise3 = LocalisationModule(64, 32)
+ self.upsample4 = UpsamplingModule(32, 16)
+
+ self.conv_output = StandardModule(32, 32)
+
+ # first segmentation layer
+ self.segmentation1 = SegmentationLayer(64, NUM_SEGMENTS)
+
+ # second segmentation layer
+ self.segmentation2 = SegmentationLayer(32, NUM_SEGMENTS)
+
+ # third segmentation layer
+ self.segmentation3 = SegmentationLayer(32, NUM_SEGMENTS)
+
+ # upscaling layers
+ self.upscale_1 = UpScaleModule(NUM_SEGMENTS, NUM_SEGMENTS)
+ self.upscale_2 = UpScaleModule(NUM_SEGMENTS, NUM_SEGMENTS)
+
+
+ def forward(self, x):
+
+ # Level 1 context pathway
+ conv_out_1 = self.block1(x)
+ context_out_1 = self.context1(conv_out_1)
+ element_sum_1 = conv_out_1 + context_out_1
+
+ # Level 2 context pathway
+ conv_out_2 = self.block2(element_sum_1)
+ context_out_2 = self.context2(conv_out_2)
+ element_sum_2 = conv_out_2 + context_out_2
+
+ # Level 3 context pathway
+ conv_out_3 = self.block3(element_sum_2)
+ context_out_3 = self.context3(conv_out_3)
+ element_sum_3 = conv_out_3 + context_out_3
+
+ # Level 4 context pathway
+ conv_out_4 = self.block4(element_sum_3)
+ context_out_4 = self.context4(conv_out_4)
+ element_sum_4 = conv_out_4 + context_out_4
+
+ # Level 5 context pathway
+ conv_out_5 = self.block5(element_sum_4)
+ context_out_5 = self.context5(conv_out_5)
+ element_sum_5 = conv_out_5 + context_out_5
+
+ # Level 0 localisation pathway
+ upsample_out_1 = self.upsample1(element_sum_5)
+
+ # Level 1 localisation pathway
+ concat_1 = torch.cat((element_sum_4, upsample_out_1), dim = 1)
+ localisation_out_1 = self.localise1(concat_1)
+ upsample_out_2 = self.upsample2(localisation_out_1)
+
+ # Level 2 localisation pathway
+ concat_2 = torch.cat((element_sum_3, upsample_out_2), dim = 1)
+ localisation_out_2 = self.localise2(concat_2)
+ upsample_out_3 = self.upsample3(localisation_out_2)
+
+ # Level 3 localisation pathway
+ concat_3 = torch.cat((element_sum_2, upsample_out_3), dim = 1)
+ localisation_out_3 = self.localise3(concat_3)
+ upsample_out_4 = self.upsample4(localisation_out_3)
+
+ # Level 4 localisation pathway
+ concat_4 = torch.cat((element_sum_1, upsample_out_4), dim = 1)
+ convoutput_out = self.conv_output(concat_4)
+
+ # 1st Segmentation Layer
+ segment_out_1 = self.segmentation1(localisation_out_2)
+ upscale_out_1 = self.upscale_1(segment_out_1)
+
+ # 2nd Segmentation Layer
+ segment_out_2 = self.segmentation2(localisation_out_3)
+ seg_sum_1 = upscale_out_1 + segment_out_2
+
+ # 3rd Segmentation Layer
+ upscale_out_2 = self.upscale_2(seg_sum_1)
+ segment_out_3 = self.segmentation3(convoutput_out)
+
+ final_sum = upscale_out_2 + segment_out_3
+
+ output = torch.softmax(final_sum, dim = 1)
+
+ return output
\ No newline at end of file
diff --git a/recognition/47049358/predict.py b/recognition/47049358/predict.py
new file mode 100644
index 000000000..d39a71bae
--- /dev/null
+++ b/recognition/47049358/predict.py
@@ -0,0 +1,261 @@
+"""
+The file contains a method to visualise and/or measure the performance of the trained model
+on unseen data.
+"""
+# libraries
+import torch
+import torch.nn as nn
+import numpy as np
+import matplotlib.pyplot as plt
+from time import time
+from monai.losses import DiceLoss
+from monai.data import DataLoader, Dataset
+from monai.transforms import (AsDiscrete, Compose, CastToType)
+
+# import from local files
+from train import trained_model, CRITERION, compute_dice_segments, DEVICE, CRITERION_NAME
+from dataset import test_dict, test_transforms
+
+__author__ = "Ryuto Hisamoto"
+
+__license__ = "Apache"
+__version__ = "1.0.0"
+__maintainer__ = "Ryuto Hisamoto"
+__email__ = "s4704935@student.uq.edu.au"
+__status__ = "Committed"
+
+def visualise_ground_truths(images: list, ground_truths: list, criterion_name: str):
+ """ Visualises the ground truths and their images by overlaying them on the same 3 x 3 plot.
+
+ Args:
+ images (list): Images to overlay labels on.
+ ground_truths (list): Labels to overlay on top of images.
+ criterion_name (str): Name of the loss function used during the training to name the plot.
+
+ Returns:
+ None: The function only plots, so it does not return any value.
+ """
+
+ # Create a 3x3 grid of subplots
+ fig, axes = plt.subplots(3, 3, figsize=(15, 15))
+
+ # Plot the images
+ for i in range(3):
+ for j in range(3):
+
+ idx = i * 3 + j
+
+ # Original image
+
+ image = images[idx]
+
+ axes[i, j].imshow(image, cmap='gray')
+ axes[i, j].axis('off')
+ axes[i, j].set_title(f'Image {idx+1}')
+
+ # Ground truth mask
+
+ ground_truth = ground_truths[idx]
+ num_masks = ground_truth.shape[0]
+
+ mask_gt = np.zeros((ground_truth.shape[1], ground_truth.shape[2]), dtype = np.uint8)
+
+ for k in range(num_masks):
+ mask_gt += (k + 1) * ground_truth[k, : , : ]
+ axes[i, j].imshow(mask_gt, cmap='jet', alpha=0.3)
+
+ # Show the plot
+ plt.tight_layout()
+ plt.savefig(f'ground_truths_{str(CRITERION_NAME)}.png')
+ plt.close()
+
+def visualise_predictions(images: list, predictions: list, criterion_name : str):
+ """Visualises the predictions and their images by overlaying them on the same 3 x 3 plot.
+
+ Args:
+ images (list): A list of images to lay predicted labels on
+ predictions (list): A list of predicted labels proeuced by the model
+ criterion (str): The name of loss function used during the training to name the plot.
+
+ Returns:
+ None: The function only plots, so it does not return any value.
+ """
+
+ # Create a 3x3 grid of subplots
+ fig, axes = plt.subplots(3, 3, figsize=(15, 15))
+
+ # Plot the images
+ for i in range(3):
+ for j in range(3):
+
+ idx = i * 3 + j
+
+ # Original image
+
+ image = images[idx]
+
+ axes[i, j].imshow(image, cmap='gray')
+ axes[i, j].axis('off')
+ axes[i, j].set_title(f'Image {idx+1}')
+
+ mask_pred = predictions[idx]
+
+ axes[i, j].imshow(mask_pred, cmap='jet', alpha=0.3)
+
+ # Show the plot
+ plt.tight_layout()
+ plt.savefig(f'predictions_{str(CRITERION_NAME)}.png')
+ plt.close()
+
+def test(model: nn.Module, test_loader: DataLoader, device: torch.device | str):
+    """The function which tests the model on unseen data stored in a DataLoader.
+
+    For each test sample it computes the overall Dice score (1 - DiceLoss) and the
+    per-segment Dice coefficients, collects up to nine image/label/prediction slices
+    for visualisation, then saves ground-truth and prediction overlay figures.
+
+    Args:
+        model (nn.Module): A trained model that is to be tested.
+        test_loader (DataLoader): DataLoader instance which contains image data and their labels for the model
+            to compare its performance against. Assumes batch_size of 1 — the code
+            indexes batch element 0 only; TODO confirm with caller.
+        device (torch.device | str): A device the training is based on.
+
+    Returns:
+        tuple: A tuple containing:
+            - np.array: An array of overall dice score for each test image and labels
+            - np.array: An array of segment 0 dice score for each test image and labels
+            - np.array: An array of segment 1 dice score for each test image and labels
+            - np.array: An array of segment 2 dice score for each test image and labels
+            - np.array: An array of segment 3 dice score for each test image and labels
+            - np.array: An array of segment 4 dice score for each test image and labels
+            - np.array: An array of segment 5 dice score for each test image and labels
+    """
+
+    model.to(device)
+    model.eval() # Set the model to evaluation mode
+
+    # Dice over the whole batch (batch=True) — note this is a fresh criterion,
+    # independent of the training CRITERION imported from train.
+    criterion = DiceLoss(batch = True)
+
+    test_dice_coefs = np.array([]) # stores dice scores.
+    # One accumulator per segmentation class (6 classes total).
+    seg_0_dice_coef = np.array([])
+    seg_1_dice_coef = np.array([])
+    seg_2_dice_coef = np.array([])
+    seg_3_dice_coef = np.array([])
+    seg_4_dice_coef = np.array([])
+    seg_5_dice_coef = np.array([])
+
+    # Slices collected for the 3x3 visualisation grids (first 9 samples).
+    images = []
+    ground_truths = []
+    predictions = []
+
+    # argmax output -> one-hot (6 channels) -> uint8, so predictions are
+    # comparable with the one-hot labels in DiceLoss.
+    output_transform = Compose(
+        [
+            AsDiscrete(to_onehot=6),
+            CastToType(dtype=torch.uint8),
+        ]
+)
+
+    with torch.no_grad():
+
+        for i, batch_data in enumerate(test_loader):
+            inputs, labels = (
+                batch_data["image"].to(device),
+                batch_data["label"].to(device),
+            )
+            outputs = model(inputs)
+            # argmax drops the batch dim for batch_size 1; np.newaxis restores it
+            # so downstream code sees (1, C, H, W, D) — TODO confirm shapes.
+            outputs = output_transform(torch.argmax(outputs, dim=1))[np.newaxis, : , : , : , :]
+            segment_coefs = compute_dice_segments(outputs, labels, device)
+            dice_loss = criterion(outputs, labels).item()
+
+            # Dice score is the complement of Dice loss.
+            test_dice = 1 - dice_loss
+
+            # Keep the first 9 samples' middle-ish slice (index 50 on the last
+            # axis) for the overlay figures.
+            if len(images) < 9:
+                image = inputs[0, 0 , : , : , 50].cpu().numpy()
+                images.append(image)
+                mask = labels[0, : , : , : , 50].cpu().numpy().astype(np.uint8)
+                ground_truths.append(mask)
+                prediction = torch.argmax(outputs[0, : , : , : , 50 ], dim = 0).cpu().numpy().astype(np.uint8)
+                predictions.append(prediction)
+
+            # Record per-segment scores for this sample.
+            seg_0_dice_coef = np.append(seg_0_dice_coef, segment_coefs[0].item())
+            seg_1_dice_coef = np.append(seg_1_dice_coef, segment_coefs[1].item())
+            seg_2_dice_coef = np.append(seg_2_dice_coef, segment_coefs[2].item())
+            seg_3_dice_coef = np.append(seg_3_dice_coef, segment_coefs[3].item())
+            seg_4_dice_coef = np.append(seg_4_dice_coef, segment_coefs[4].item())
+            seg_5_dice_coef = np.append(seg_5_dice_coef, segment_coefs[5].item())
+
+            print(f'Test No.{i} - Overall Dice Coefficient: {test_dice}')
+
+            test_dice_coefs = np.append(test_dice_coefs, test_dice)
+
+    # Save the 3x3 overlay figures for ground truths and predictions.
+    visualise_ground_truths(images, ground_truths, CRITERION_NAME)
+    visualise_predictions(images, predictions, CRITERION_NAME)
+
+    return test_dice_coefs, seg_0_dice_coef, seg_1_dice_coef, seg_2_dice_coef, seg_3_dice_coef, seg_4_dice_coef, seg_5_dice_coef
+
+def plot_dice(criterion_name : str, segment_coefs: np.array):
+ """ A method that plots a bar chart to visualise the performance of model on unseen data
+ for each label. It is meant to demonstrated how accurately the model produces segmentations
+ for each lebel.
+
+ Args:
+ criterion (str): The name of loss function used during the training to name the plot.
+ segment_coefs (np.array): an array containing dice scores for each segment at corresponding indices.
+ """
+
+ x_values = np.arange(len(segment_coefs)) # Generate x-values as indices
+
+ # Plot overall dice scores
+ plt.bar(x_values, segment_coefs)
+
+ plt.xlabel("Segment No.")
+ plt.ylabel("Dice Score")
+ plt.title("Dice Score for Each Segment")
+ plt.legend()
+ plt.grid(True)
+ plt.savefig(f'dice_coefs_test_{str(criterion_name)}.png')
+ plt.close()
+
+
+if __name__ == "__main__":
+ # connect to gpu
+
+ test_set = Dataset(test_dict, test_transforms)
+ test_loader = DataLoader(dataset = test_set, batch_size = 1)
+
+ print('> Start Testing')
+
+ start = time()
+
+ # perform predictions
+ dice_coefs, s0, s1, s2, s3, s4, s5 = test(model = trained_model, test_loader = test_loader,
+ device = DEVICE)
+
+ end = time()
+
+ elapsed_time = end - start
+
+ print(f"> Test completed in {elapsed_time:.2f} seconds")
+
+ average_dice = np.mean(dice_coefs)
+ print(f"Average Dice Coefficient: {average_dice:.4f}")
+
+ average_s0 = np.mean(s0)
+ print(f"Segment 0 Dice Coefficient: {average_s0:.4f}")
+
+ average_s1 = np.mean(s1)
+ print(f"Segment 1 Dice Coefficient: {average_s1:.4f}")
+
+ average_s2 = np.mean(s2)
+ print(f"Segment 2 Dice Coefficient: {average_s2:.4f}")
+
+ average_s3 = np.mean(s3)
+ print(f"Segment 3 Dice Coefficient: {average_s3:.4f}")
+
+ average_s4 = np.mean(s4)
+ print(f"Segment 4 Dice Coefficient: {average_s4:.4f}")
+
+ average_s5 = np.mean(s5)
+ print(f"Segment 5 Dice Coefficient: {average_s5:.4f}")
+
+ segment_coefs = np.array([average_s0, average_s1, average_s2, average_s3,
+ average_s4, average_s5])
+
+ # plot dice scores across the dataset.
+ plot_dice(CRITERION_NAME, segment_coefs)
\ No newline at end of file
diff --git a/recognition/47049358/requirements.txt b/recognition/47049358/requirements.txt
new file mode 100644
index 000000000..f0f59e104
--- /dev/null
+++ b/recognition/47049358/requirements.txt
@@ -0,0 +1,113 @@
+# This file may be used to create an environment using:
+# $ conda create --name