Malaria Detection¶
Executive Summary¶
A Deep Learning model is developed to accurately identify images of red blood cells infected with malaria. The model is built as a Convolutional Neural Network (CNN), trained on almost 25,000 images of parasitised and uninfected red blood cells, then tested on a further 2,600 images of infected and uninfected cells. The chosen final model is highly successful at correctly classifying infected cells, achieving a recall of 98.8% on malaria-infected cells in particular (98.4% recall for the model overall). From a business standpoint, this exceeds the WHO-defined sensitivity benchmark of 95% for malaria detection methods [1] by 3.8 percentage points, indicating the model meets conventional standards. Coupled with the considerable labour reduction such a model brings, this would be a highly advantageous tool for malaria diagnosis.
While the margin for improvement on this model is small, some gains may be achieved by trialling other image enhancement and augmentation methods. HSV conversion is used in this model; other methods, such as local contrast enhancement, may further highlight the parasites present on infected cells, leading to higher recall on images of infected cells. A brief sketch of one such method follows.
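As a hedged illustration of the local contrast enhancement idea mentioned above (not part of the final model), the sketch below applies CLAHE to the value channel of an HSV representation of a single cell image; the function name and parameter values are illustrative assumptions.
#illustrative sketch only: local contrast enhancement via CLAHE (not used in the final model)
#assumes `img` is a single cell image as a uint8 RGB array, e.g. np.array(Image.open(path).resize((64, 64)))
def enhance_local_contrast(img, clip_limit=2.0, tile_grid_size=(8, 8)):
    #convert to HSV and equalise only the value (brightness) channel
    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    h, s, v = cv2.split(hsv)
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    v_eq = clahe.apply(v)
    #merge the channels back and return an RGB image for display or training
    return cv2.cvtColor(cv2.merge([h, s, v_eq]), cv2.COLOR_HSV2RGB)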
Problem Summary¶
Malaria is a life-threatening illness transmitted to humans through the bites of infected mosquitoes. The parasite enters the bloodstream, damages red blood cells and can lead to respiratory problems. The parasites can remain dormant in a person's body for over a year without displaying symptoms, making timely treatment crucial to avoid severe consequences (up to and including fatality). Almost half of the global population is at risk of malaria, with children being particularly vulnerable to this illness.
Traditional laboratory diagnosis of malaria requires careful examination by skilled professionals to distinguish between healthy and infected red blood cells, a laborious and time-consuming process prone to human error and variability between lab professionals.
The objective of this project is to create a high-performance computer vision model for malaria detection. The model will analyse images of red blood cells and determine whether or not they are infected with malaria, with binary classification labelling each cell as either parasitised or uninfected.
Solution Design¶
Based on the above requirements and the need to correctly classify as many infected red blood cells as possible, this is treated as a recall problem: the most important metric is the model's ability to identify as many malaria-infected cells as possible, which means minimising the number of false negatives, since falsely classifying infected cells as uninfected would be more detrimental to the model's goals than the reverse. A CNN architecture is used, given this architecture's proficiency in working with image data.
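To make the driving metric concrete: recall for the parasitised class is the fraction of truly infected cells the model actually flags, i.e. TP / (TP + FN). A minimal sketch with scikit-learn, using hypothetical labels purely for illustration:
#minimal illustration of the recall metric (hypothetical labels, not project data)
from sklearn.metrics import recall_score
y_true = [1, 1, 1, 1, 0, 0, 0, 0]   #1 = parasitised, 0 = uninfected
y_pred = [1, 1, 1, 0, 0, 0, 1, 0]   #one infected cell is missed (a false negative)
#recall = TP / (TP + FN) = 3 / (3 + 1) = 0.75 for the parasitised class
print(recall_score(y_true, y_pred))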
Prior to classification, images fed into this model are converted from RGB to HSV, as this is found to greatly improve the discernibility of parasites on the infected red blood cells.
The model architecture comprises six successive convolutional blocks, each consisting of a convolutional layer with Leaky ReLU activation followed by batch normalisation, max pooling, and dropout, to extract relevant features while reducing the chance of overfitting.
The output of the final convolutional block is flattened into a 1D vector before being fed into the fully connected layers, culminating in a sigmoid output layer for binary classification.
The full details of the data preprocessing and model building steps, along with model evaluation, analysis and recommendations for implementation, can be found below.
Mounting Drive¶
#mounting google drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading libraries¶
#basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
import cv2
from PIL import Image
import random
from random import shuffle
import warnings
warnings.simplefilter("ignore")
#model selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
#deep learning training
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Conv2D,LeakyReLU,MaxPooling2D,Flatten
from tensorflow.keras.applications import VGG16
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import losses, backend
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
#model metrics
from sklearn.metrics import classification_report, confusion_matrix
Loading the data¶
# Storing the path of the data file from the Google drive
path = '/content/drive/MyDrive/cell_images.zip'
# The data is provided as a zip file so we need to extract the files from the zip file
with zipfile.ZipFile(path, 'r') as zip_folder:
    zip_folder.extractall()
Inputting data to lists¶
#setting train directory
train_dir = '/content/cell_images/train'
#setting test directory
test_dir = '/content/cell_images/test'
train_images = []
train_labels = []
test_images = []
test_labels = []
#iterating through folders
for folder_name in ['/parasitized/', '/uninfected/']:
    #folder path
    images_path = os.listdir(train_dir + folder_name)
    for i, image_name in enumerate(images_path):
        try:
            #opening image
            image = Image.open(train_dir + folder_name + image_name)
            #resizing image
            image = image.resize((64, 64))
            #converting image to array, appending to list
            train_images.append(np.array(image))
            #labels for parasitised, uninfected
            if folder_name == '/parasitized/':
                train_labels.append(1)
            else:
                train_labels.append(0)
        except Exception:
            pass
#converting lists to arrays
train_images = np.array(train_images)
train_labels = np.array(train_labels)
#iterating through folders
for folder_name in ['/parasitized/', '/uninfected/']:
    #folder path
    images_path = os.listdir(test_dir + folder_name)
    for i, image_name in enumerate(images_path):
        try:
            #opening image
            image = Image.open(test_dir + folder_name + image_name)
            #resizing image
            image = image.resize((64, 64))
            #converting image to array, appending to list
            test_images.append(np.array(image))
            #labels for parasitised, uninfected
            if folder_name == '/parasitized/':
                test_labels.append(1)
            else:
                test_labels.append(0)
        except Exception:
            pass
#converting lists to arrays
test_images = np.array(test_images)
test_labels = np.array(test_labels)
Check the shape of train and test images¶
#printing set shapes to confirm number of images
print("Train data shape:", train_images.shape)
print("Test data shape:", test_images.shape)
Train data shape: (24958, 64, 64, 3) Test data shape: (2600, 64, 64, 3)
Check the shape of train and test labels¶
#printing set shapes to confirm number of images
print("Train labels shape:", train_labels.shape)
print("Test labels shape:", test_labels.shape)
Train labels shape: (24958,) Test labels shape: (2600,)
Observations and insights:¶
- The train set contains 24,958 images, while the test set contains 2,600 images, meaning roughly 9.4% of the total images have been earmarked for testing.
- The images in both the train and test sets have dimensions of 64x64 pixels, with an additional dimension of 3 indicating these are colour images (RGB).
- The train labels array has a shape of (24958,), while the test labels array has a shape of (2600,), indicating both arrays contain labels for all of their images.
Check the minimum and maximum range of pixel values for train and test images¶
#check min and max pixel values for train images
train_min = np.amin(train_images)
train_max = np.amax(train_images)
#check min and max pixel values for test images
test_min = np.amin(test_images)
test_max = np.amax(test_images)
#printing output
print("Train Images:\n Min pixel value:", train_min, "\n Max pixel value:", train_max)
print("\nTest Images:\n Min pixel value:", test_min, "\n Max pixel value:", test_max)
Train Images: Min pixel value: 0 Max pixel value: 255 Test Images: Min pixel value: 0 Max pixel value: 255
Observations and insights:¶
- Both the train images and test images have a minimum pixel value of 0 and a maximum pixel value of 255.
Count the number of values in the train and test images¶
#count number of values in each set
train_count = train_images.size
test_count = test_images.size
#printing output
print("Number of values in train images:", train_count)
print("Number of values in test images:", test_count)
Number of values in train images: 306683904 Number of values in test images: 31948800
Normalise the images¶
#normalise train images, convert to float32
train_images = (train_images/255).astype('float32')
#normalise test images, convert to float32
test_images = (test_images / 255).astype('float32')
Observations and insights:¶
- There are significantly more values in the train images dataset (306,683,904) than the test images dataset (31,948,800).
- The number of values in the test images amounts to roughly 1/10th the values in the train images.
- Therefore the models will be trained on a very large dataset and tested on a much smaller dataset.
Plot to check if the data is balanced¶
#count occurrences of each label in the training set, print output
train_label_counts = np.bincount(train_labels)
print('Number of uninfected images in train data:',train_label_counts[0])
print('Number of parasitised images in train data:',train_label_counts[1])
#count occurrences of each label in the test set, print output
test_label_counts = np.bincount(test_labels)
print('\nNumber of uninfected images in test data:',test_label_counts[0])
print('Number of parasitised images in test data:',test_label_counts[1],'\n')
#plotting distributions
plt.figure(figsize=(10, 5))
#plotting for train data
plt.subplot(1, 2, 1)
plt.bar(range(len(train_label_counts)), train_label_counts, color='blue')
plt.title('Train Data Distribution')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(range(len(train_label_counts)))
#plotting for test data
plt.subplot(1, 2, 2)
plt.bar(range(len(test_label_counts)), test_label_counts, color='red')
plt.title('Test Data Distribution')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(range(len(test_label_counts)))
plt.tight_layout()
plt.show()
Number of uninfected images in train data: 12376 Number of parasitised images in train data: 12582 Number of uninfected images in test data: 1300 Number of parasitised images in test data: 1300
Observations and insights:¶
- The data appears to be well balanced between parasitised and uninfected images, for both the training data and test data.
- There are slightly more parasitised images than uninfected images in the training data, with 12,582 parasitised images compared to 12,376 uninfected images. However, this amounts to a difference of approximately 1.7%, and shouldn't have a significant effect on predictions.
- The images in the test data are perfectly balanced, with 1,300 images for both parasitised and uninfected classes.
Visualising the Training Images¶
#setting random seed
np.random.seed(42)
plt.figure(figsize=(12, 12))
#iterating through 36 images
for i in range(1, 37):
    plt.subplot(6, 6, i)
    #random index from train_images
    random_index = np.random.randint(0, train_images.shape[0])
    #get image and label
    image = train_images[random_index]
    label = train_labels[random_index]
    #set title
    title = 'parasitized' if label == 1 else 'uninfected'
    plt.title(title)
    #display image
    plt.imshow(image)
    plt.axis('off')
plt.show()
Observations and insights:¶
- The above images show a random selection of unprocessed images from the training dataset.
- In the parasitised images, there appear to be dark red-purple blots on the cell, showing the presence of the malaria parasite. These blots are absent from the uninfected cells.
- Aside from this, the cell images appear to range in size, shape, and colour, which may be a potential issue for a deep learning classification model.
Visualising the Test Images¶
#set random seed
np.random.seed(42)
plt.figure(figsize=(12, 12))
#visualising 36 random images from test images
for i in range(1, 37):
    plt.subplot(6, 6, i)
    #randomly select index from test_images
    random_index = np.random.randint(0, test_images.shape[0])
    #get image and label
    image = test_images[random_index]
    label = test_labels[random_index]
    #set title
    title = 'parasitized' if label == 1 else 'uninfected'
    plt.title(title)
    #display image
    plt.imshow(image)
    plt.axis('off')
plt.show()
Observations and insights:¶
- Similar to the earlier exercise, these images show a random selection of unprocessed images from the test dataset.
- Again, in the parasitised images there appear to be dark red-purple blots on the cell, showing the presence of the malaria parasite, while these blots are absent from the uninfected cells.
- As with the training images, the cell images appear to range in size, shape, and colour, which may be a potential issue for a deep learning classification model.
Plotting the mean images for parasitised and uninfected¶
#function to find the mean
def find_mean_image(full_images, title):
    # Calculate average image
    mean_image = np.mean(full_images, axis=0)
    # Plot the average image
    plt.imshow(mean_image)
    plt.title(f'Average {title}')
    plt.axis('off')
    plt.show()
    return mean_image
Mean image for parasitised¶
#creating list to hold parasitised images
parasitised_data = []
#iterate through train images and labels
for img, label in zip(train_images, train_labels):
    #check if label is 1
    if label == 1:
        parasitised_data.append(img)
#convert list to numpy array
parasitised_data = np.array(parasitised_data)
#calculate and plot mean of parasitised images
parasitized_mean = find_mean_image(parasitised_data, 'Parasitized')
Mean image for uninfected¶
#creating list to hold uninfected images
uninfected_data = []
#iterate through train images and labels
for img, label in zip(train_images, train_labels):
    #check if label is 0
    if label != 1:
        uninfected_data.append(img)
#convert list to numpy array
uninfected_data = np.array(uninfected_data)
#calculate and plot mean of uninfected images
uninfected_mean = find_mean_image(uninfected_data, 'Uninfected')
Observations and insights:¶
- Comparing the mean images for the parasitised and uninfected classes in the training data, there doesn't appear to be a significant difference between them.
- The mean parasitised image does appear slightly darker, but apart from this there are no obvious differences; a small sketch exploring this further follows below.
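To probe this further, an optional sketch (reusing the mean images computed above) plots the pixel-wise difference between the two mean images, which can make subtle class-level differences easier to see:
#optional sketch: visualise the pixel-wise difference between the two mean images
diff_image = parasitized_mean - uninfected_mean
plt.imshow(diff_image.mean(axis=2), cmap='coolwarm')
plt.title('Mean Parasitised minus Mean Uninfected')
plt.colorbar()
plt.axis('off')
plt.show()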
Converting the train data from RGB to HSV¶
#fixing seed for random number generators
np.random.seed(42)
#setting figsize
plt.figure(figsize=(12, 12))
#creating list to hold the HSV images
train_images_hsv = []
#convert all train images to HSV
for img in train_images:
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hsv_img = (hsv_img * 255).astype(np.uint8)
    train_images_hsv.append(hsv_img)
#convert list to a numpy array
train_images_hsv = np.array(train_images_hsv)
for i in range(1, 37):
    plt.subplot(6, 6, i)
    #randomly select index from train_images_hsv
    random_index = np.random.randint(0, train_images_hsv.shape[0])
    #get image and label
    image = train_images_hsv[random_index]
    label = train_labels[random_index]
    #set title
    title = 'parasitized' if label == 1 else 'uninfected'
    plt.title(title)
    #display image
    plt.imshow(image)
    plt.axis('off')
plt.show()
Converting the test data from RGB to HSV¶
np.random.seed(42)
plt.figure(figsize=(12, 12))
#creating list to hold HSV images
test_images_hsv = []
#convert all test images to HSV
for img in test_images:
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hsv_img = (hsv_img * 255).astype(np.uint8)
    test_images_hsv.append(hsv_img)
#convert list to numpy array
test_images_hsv = np.array(test_images_hsv)
for i in range(1, 37):
    plt.subplot(6, 6, i)
    #randomly select index from the test_images_hsv array
    random_index = np.random.randint(0, test_images_hsv.shape[0])
    #get image and label
    image = test_images_hsv[random_index]
    label = test_labels[random_index]
    #set title
    title = 'parasitized' if label == 1 else 'uninfected'
    plt.title(title)
    #display the image
    plt.imshow(image)
    plt.axis('off')
plt.show()
Observations and insights:¶
- Converting the train and test images to HSV does appear to highlight instances of the parasite in infected cells, now showing up as yellow-green blobs in the images.
- There still appears to be some red-blue noise in the images, which may affect the performance of the CNN model; one possible mitigation is sketched below.
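If this residual noise were found to affect training, one hedged option (not applied to the data used by the final model below) would be a light Gaussian blur on the HSV images before fitting, for example:
#optional, exploratory sketch: light Gaussian blur to suppress high-frequency noise
#(these arrays are illustrative and are not used by the final model below)
train_images_blur = np.array([cv2.GaussianBlur(img, (3, 3), 0) for img in train_images_hsv])
test_images_blur = np.array([cv2.GaussianBlur(img, (3, 3), 0) for img in test_images_hsv])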
One Hot Encoding the train and test labels¶
#encoding train, test labels
train_labels = to_categorical(train_labels, 2)
test_labels = to_categorical(test_labels, 2)
#confirming data shapes before proceeding
print("Shape of Training Data:", train_images_gauss.shape)
print("Shape of Training Labels:", train_labels.shape)
print("Shape of Test Data:", test_images_gauss.shape)
print("Shape of Test Labels:", test_labels.shape)
Shape of Training Data: (24958, 64, 64, 3) Shape of Training Labels: (24958, 2) Shape of Test Data: (2600, 64, 64, 3) Shape of Test Labels: (2600, 2)
Defining Functions for Evaluation¶
Defining function to print the classification report, plot the confusion matrix
#defining function to plot confusion matrix, print classification report
def plot_con_matrix(model, test_images, test_labels):
    # Making predictions on the supplied test images
    pred = model.predict(test_images)
    pred = np.argmax(pred, axis=1)
    # Getting true labels
    y_true = np.argmax(test_labels, axis=1)
    # Printing classification report, setting to 3 decimal places
    print(classification_report(y_true, pred, digits=3))
    # Plotting confusion matrix heatmap
    cm = confusion_matrix(y_true, pred)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt='.0f', xticklabels=['Uninfected', 'Parasitised'], yticklabels=['Uninfected', 'Parasitised'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
Defining function to plot the recall and loss curves
#defining function to plot recall and loss
def plot_recall_and_loss(history):
    #get number of epochs
    epochs = range(1, len(history.history["recall"]) + 1)
    #create figure and axes
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    #plot recall
    ax1.plot(epochs, history.history["recall"], label="Train Recall", ls='-')
    ax1.plot(epochs, history.history["val_recall"], label="Validation Recall", ls='-')
    ax1.set_title("Recall vs Epoch")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Recall")
    ax1.legend(loc="upper left")
    #plot loss
    ax2.plot(epochs, history.history["loss"], label="Train Loss", ls='-')
    ax2.plot(epochs, history.history["val_loss"], label="Validation Loss", ls='-')
    ax2.set_title("Loss vs Epoch")
    ax2.set_xlabel("Epochs")
    ax2.set_ylabel("Loss")
    ax2.legend(loc="upper right")
    # Show plot
    plt.show()
Building the Model¶
#clearing keras backend
backend.clear_session()
#fixing seed for random number generators
np.random.seed(42)
tf.random.set_seed(42)
#defining model and layers
malaria_model = Sequential([
#convolutional layers with same padding to maintain input size,
#leakyrelu to introduce non-linearity
Conv2D(32, (2, 2), padding='same', activation=LeakyReLU(alpha=0.2), input_shape=(64, 64, 3)),
#normalising with batch normalisation
BatchNormalization(),
#downsampling with max pooling
MaxPooling2D((2, 2)),
#regularising with dropout
Dropout(0.2),
#repeating convolutional layer pattern
Conv2D(32, (2, 2), padding='same', activation=LeakyReLU(alpha=0.2)),
BatchNormalization(),
MaxPooling2D((2, 2)),
Dropout(0.2),
Conv2D(32, (2, 2), padding='same', activation=LeakyReLU(alpha=0.2)),
BatchNormalization(),
MaxPooling2D((2, 2)),
Dropout(0.2),
Conv2D(32, (2, 2), padding='same', activation=LeakyReLU(alpha=0.2)),
BatchNormalization(),
MaxPooling2D((2, 2)),
Dropout(0.2),
Conv2D(32, (2, 2), padding='same', activation=LeakyReLU(alpha=0.2)),
BatchNormalization(),
MaxPooling2D((2, 2)),
Dropout(0.2),
Conv2D(32, (2, 2), padding='same', activation=LeakyReLU(alpha=0.2)),
BatchNormalization(),
MaxPooling2D((2, 2)),
Dropout(0.2),
#converting to 1D vector with flatten
Flatten(),
#fully connected layers
#connecting neurons from previous layers with dense
Dense(512, activation=LeakyReLU(alpha=0.2)),
#regularising with dropout
Dropout(0.4),
#using sigmoid activation on final dense layer for binary classification
Dense(2, activation='sigmoid')
])
#calling summary of model
malaria_model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D)                             (None, 64, 64, 32)        416
batch_normalization (BatchNormalization)    (None, 64, 64, 32)        128
max_pooling2d (MaxPooling2D)                (None, 32, 32, 32)        0
dropout (Dropout)                           (None, 32, 32, 32)        0
conv2d_1 (Conv2D)                           (None, 32, 32, 32)        4128
batch_normalization_1 (BatchNormalization)  (None, 32, 32, 32)        128
max_pooling2d_1 (MaxPooling2D)              (None, 16, 16, 32)        0
dropout_1 (Dropout)                         (None, 16, 16, 32)        0
conv2d_2 (Conv2D)                           (None, 16, 16, 32)        4128
batch_normalization_2 (BatchNormalization)  (None, 16, 16, 32)        128
max_pooling2d_2 (MaxPooling2D)              (None, 8, 8, 32)          0
dropout_2 (Dropout)                         (None, 8, 8, 32)          0
conv2d_3 (Conv2D)                           (None, 8, 8, 32)          4128
batch_normalization_3 (BatchNormalization)  (None, 8, 8, 32)          128
max_pooling2d_3 (MaxPooling2D)              (None, 4, 4, 32)          0
dropout_3 (Dropout)                         (None, 4, 4, 32)          0
conv2d_4 (Conv2D)                           (None, 4, 4, 32)          4128
batch_normalization_4 (BatchNormalization)  (None, 4, 4, 32)          128
max_pooling2d_4 (MaxPooling2D)              (None, 2, 2, 32)          0
dropout_4 (Dropout)                         (None, 2, 2, 32)          0
conv2d_5 (Conv2D)                           (None, 2, 2, 32)          4128
batch_normalization_5 (BatchNormalization)  (None, 2, 2, 32)          128
max_pooling2d_5 (MaxPooling2D)              (None, 1, 1, 32)          0
dropout_5 (Dropout)                         (None, 1, 1, 32)          0
flatten (Flatten)                           (None, 32)                0
dense (Dense)                               (None, 512)               16896
dropout_6 (Dropout)                         (None, 512)               0
dense_1 (Dense)                             (None, 2)                 1026
=================================================================
Total params: 39746 (155.26 KB)
Trainable params: 39362 (153.76 KB)
Non-trainable params: 384 (1.50 KB)
_________________________________________________________________
Compiling the model¶
#instantiating adam optimiser
adam = Adam(learning_rate=0.001)
#compiling model
malaria_model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = [tf.keras.metrics.Recall()])
Using Callbacks¶
#assigning early stopping and model checkpoint callbacks
callbacks = [EarlyStopping(monitor = 'val_loss', patience = 3),
ModelCheckpoint('.mdl_wts.hdf5', monitor = 'val_loss', save_best_only = True)]
Fit and Train the Model¶
#fitting and training model
malaria_model_hist = malaria_model.fit(train_images_hsv, train_labels, batch_size = 32,
callbacks = callbacks, validation_split = 0.2, epochs = 20, verbose = 1)
Epoch 1/20 624/624 [==============================] - 13s 14ms/step - loss: 0.3096 - recall: 0.8599 - val_loss: 0.1309 - val_recall: 0.9655
Epoch 2/20 624/624 [==============================] - 7s 12ms/step - loss: 0.1094 - recall: 0.9639 - val_loss: 0.0681 - val_recall: 0.9838
Epoch 3/20 624/624 [==============================] - 7s 12ms/step - loss: 0.0913 - recall: 0.9705 - val_loss: 0.0575 - val_recall: 0.9800
Epoch 4/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0846 - recall: 0.9721 - val_loss: 0.0463 - val_recall: 0.9850
Epoch 5/20 624/624 [==============================] - 7s 12ms/step - loss: 0.0770 - recall: 0.9756 - val_loss: 0.0593 - val_recall: 0.9800
Epoch 6/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0757 - recall: 0.9750 - val_loss: 0.0533 - val_recall: 0.9800
Epoch 7/20 624/624 [==============================] - 7s 11ms/step - loss: 0.0697 - recall: 0.9763 - val_loss: 0.0458 - val_recall: 0.9832
Epoch 8/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0682 - recall: 0.9778 - val_loss: 0.0632 - val_recall: 0.9774
Epoch 9/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0705 - recall: 0.9776 - val_loss: 0.0532 - val_recall: 0.9788
Epoch 10/20 624/624 [==============================] - 7s 12ms/step - loss: 0.0657 - recall: 0.9779 - val_loss: 0.0456 - val_recall: 0.9838
Epoch 11/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0618 - recall: 0.9802 - val_loss: 0.0537 - val_recall: 0.9804
Epoch 12/20 624/624 [==============================] - 7s 11ms/step - loss: 0.0617 - recall: 0.9797 - val_loss: 0.0399 - val_recall: 0.9860
Epoch 13/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0616 - recall: 0.9791 - val_loss: 0.0466 - val_recall: 0.9818
Epoch 14/20 624/624 [==============================] - 7s 12ms/step - loss: 0.0633 - recall: 0.9789 - val_loss: 0.0559 - val_recall: 0.9752
Epoch 15/20 624/624 [==============================] - 8s 13ms/step - loss: 0.0623 - recall: 0.9794 - val_loss: 0.0541 - val_recall: 0.9790
Evaluating the model¶
#evaluating for recall
malaria_model_eval = malaria_model.evaluate(test_images_hsv, test_labels, verbose = 1)
print('\n', 'Test Recall:', malaria_model_eval[1])
82/82 [==============================] - 1s 4ms/step - loss: 0.0446 - recall: 0.9842 Test Recall: 0.9842307567596436
Plotting the confusion matrix¶
#printing classification report and plotting confusion matrix
plot_con_matrix(malaria_model, test_images_hsv, test_labels)
82/82 [==============================] - 0s 3ms/step
precision recall f1-score support
0 0.988 0.980 0.984 1300
1 0.980 0.988 0.984 1300
accuracy 0.984 2600
macro avg 0.984 0.984 0.984 2600
weighted avg 0.984 0.984 0.984 2600
Plotting the recall and loss curves¶
#plotting recall and loss for model 1
plot_recall_and_loss(malaria_model_hist)
#saving malaria model
malaria_model.save("malaria_detection_model.keras")
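As a usage sketch, the saved model can be reloaded and applied to a single cell image; the image path below is hypothetical, and the preprocessing mirrors the steps applied to the training data above.
#usage sketch: reload the saved model and classify a single cell image
#the image path is hypothetical; preprocessing mirrors the steps used on the training data
from tensorflow.keras.models import load_model
loaded_model = load_model("malaria_detection_model.keras")
#load and resize the image, dropping any alpha channel
new_image = Image.open('/content/sample_cell.png').resize((64, 64))
new_image = (np.array(new_image)[:, :, :3] / 255).astype('float32')
#apply the same HSV conversion used for the training data
new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2HSV)
new_image = (new_image * 255).astype(np.uint8)
#predict on a batch of one image and report the class
pred = loaded_model.predict(np.expand_dims(new_image, axis=0))
print('parasitised' if np.argmax(pred, axis=1)[0] == 1 else 'uninfected')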
Analysis and Key Insights¶
From the data preprocessing steps, it is clear that the malaria parasite presents as dark purple blobs on the surface of infected cells. Therefore, preprocessing that enhances the visibility of these blobs (such as HSV conversion) can reasonably be expected to improve the model's detection of the parasite.
The model shows high performance across all metrics, with precision, recall, and F1-score exceeding 98% for both classes. In particular, the chosen model, using LeakyReLU and sigmoid activations, achieves a final recall of 98.8% on malaria-infected cells. From a business standpoint, this means the model detected 98.8% of all malaria-infected cells in the test set, exceeding the World Health Organization's recommended minimum sensitivity of 95% by 3.8 percentage points [1]. This surpasses conventional standards and substantially reduces the workload required for diagnosis, making it a highly beneficial tool for malaria detection.
Additionally, the recall and loss curves above show that the training recall closely follows the trend of the validation recall, with a similar pattern in the loss curves. While the validation recall and loss do dip and spike across epochs, they never diverge significantly from their training counterparts. This indicates a reasonably good fit, suggesting the model should generalise well to unseen data.
With the above considerations, this model is chosen as the final solution design for its high recall, low rate of false positives, ease of setup, and close agreement between training and validation recall, which indicates a high likelihood of generalising well to unseen data.
The model may be further improved by adjusting the number of convolutional layers and experimenting with different activation functions. Further investigation into data augmentation methods may also yield increased recall from a more optimised model; one possible approach is sketched below.
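As one example, the ImageDataGenerator imported earlier could supply simple geometric augmentation during training; the parameters below are illustrative rather than tuned.
#illustrative sketch of geometric data augmentation (parameters are not tuned)
augmentor = ImageDataGenerator(rotation_range=30,
                               zoom_range=0.1,
                               horizontal_flip=True,
                               vertical_flip=True)
#the model could then be fitted on augmented batches, for example:
#malaria_model.fit(augmentor.flow(train_images_hsv, train_labels, batch_size=32), epochs=20, ...)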
Recommendations for Implementation¶
There are some potential challenges in implementing this model for use by technicians in the field. For example, the user interface should be easy to understand and compatible with the hardware already used by professionals. Adequate training should also be provided so that healthcare workers can use the system effectively, covering proper blood smear preparation, image capture techniques, and interpretation of the model's results.
Additionally, it will be important to maintain patient confidentiality in diagnosing malaria using this model. Implementation steps for this model should include anonymising patient data, and adhering to regulations on health information.
References¶
[1] WHO. Guidelines for the treatment of malaria. 3rd ed. Geneva: World Health Organization; 2015. https://www.afro.who.int/publications/guidelines-treatment-malaria-third-edition. Accessed 1 Apr 2024.