Let's try to develop an intuition for how we can tune a given neural network using dropout layers. We choose a binary classification problem to study the optimization approach. The process is largely empirical; however, we need an understanding of overfitting and regularization in order to fully embrace the concept.
# Loading training set: contains 100 engineered features and about 800,000
# training samples, saved as a sparse matrix.
import pickle

def _load_pickle(path):
    """Deserialize and return one local pickle artifact.

    SECURITY NOTE: pickle.load can execute arbitrary code from the file —
    only acceptable here because these are locally produced, trusted files.
    """
    with open(path, "rb") as f:
        return pickle.load(f)

X_train_balanced_trans_100 = _load_pickle("X_train_balanced_trans_100.pkl")
y_train_balanced = _load_pickle("y_train_balanced.pkl")
Start with a 'medium-complexity' model to develop expectations:
from keras import models, metrics, layers
# Note that csr type of sparse matrix runs significantly faster in keras neural network implementation
# Benchmark network: three 32-unit ReLU hidden layers feeding a single
# sigmoid output unit for binary classification.
network1 = models.Sequential([
    layers.Dense(32, activation="relu",
                 input_shape=(X_train_balanced_trans_100.shape[1],)),
    layers.Dense(32, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
network1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
# Half of the data is held out for validation so overfitting is visible.
history_net1 = network1.fit(
    X_train_balanced_trans_100.tocsr(), y_train_balanced,
    epochs=20, batch_size=500, validation_split=0.5,
)
Now reduce the complexity to monitor performance:
# Same architecture shape but narrower (16-unit) hidden layers, trained
# longer with smaller batches, to see how reduced capacity behaves.
network1 = models.Sequential([
    layers.Dense(16, activation="relu",
                 input_shape=(X_train_balanced_trans_100.shape[1],)),
    layers.Dense(16, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
network1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
history_net1 = network1.fit(
    X_train_balanced_trans_100.tocsr(), y_train_balanced,
    epochs=40, batch_size=200, validation_split=0.5,
)
import matplotlib.pyplot as plt
# Summarize the run: report the epoch with the lowest loss for each series,
# then plot training vs. validation loss against epoch number.
hist = history_net1.history
epochs = list(range(1, len(hist["loss"]) + 1))
for key, caption in (("loss", "Min loss"), ("val_loss", "Min val loss")):
    best = min(hist[key])
    print(f"{caption}: {best} at epoch {hist[key].index(best) + 1}")
for key, colour in (("loss", "b"), ("val_loss", "r")):
    plt.plot(epochs, hist[key], marker="o", color=colour, label=key)
plt.legend(fontsize=20)
plt.xlabel("Number of epochs", fontsize=20)
plt.ylabel("Loss", fontsize=20)
plt.title("Monitoring neural network performance", fontsize=25)
plt.show()
Note how the training loss keeps decreasing while the validation loss (i.e., the model's performance on unseen data) does not behave the same way. After epoch 35, the network starts overfitting to the training set.
We are going to apply the dropout regularization approach between the hidden layers of the network. This approach randomly drops neurons (at a specified dropout rate) in a given layer of the network at each learning cycle. This results in the temporary removal of these neurons, masking the contribution of their weights to the final prediction. Nearby neurons are expected to compensate for the impact of the dropped weights to achieve the same loss in the cost function. This process, when used properly, leads to better generalization of the model and helps reduce the impact of overfitting.
# Add dropout between hidden layers
from keras import models, metrics, layers
import matplotlib.pyplot as plt
# Same 16/16/16/8 hidden stack as before, but with a 0.2 dropout layer
# after every hidden layer to regularize the network.
network1 = models.Sequential()
network1.add(layers.Dense(16, activation="relu",
                          input_shape=(X_train_balanced_trans_100.shape[1],)))
network1.add(layers.Dropout(0.2))
for units in (16, 16, 8):
    network1.add(layers.Dense(units, activation="relu"))
    network1.add(layers.Dropout(0.2))
network1.add(layers.Dense(1, activation="sigmoid"))
network1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
history_net1 = network1.fit(
    X_train_balanced_trans_100.tocsr(), y_train_balanced,
    epochs=100, batch_size=200, validation_split=0.5,
)
# Report the best (lowest) epoch for each loss series, then draw the
# learning curves for this run.
hist = history_net1.history
epochs = list(range(1, len(hist["loss"]) + 1))
for key, caption in (("loss", "Min loss"), ("val_loss", "Min val loss")):
    best = min(hist[key])
    print(f"{caption}: {best} at epoch {hist[key].index(best) + 1}")
for key, colour in (("loss", "b"), ("val_loss", "r")):
    plt.plot(epochs, hist[key], marker="o", color=colour, label=key)
plt.legend(fontsize=20)
plt.xlabel("Number of epochs", fontsize=20)
plt.ylabel("Loss", fontsize=20)
plt.title("Monitoring neural network performance", fontsize=25)
plt.show()
# decrease dropout rate (model will fit stronger to training set again)
from keras import models, metrics, layers
import matplotlib.pyplot as plt
# Note that csr type of sparse matrix runs significantly faster in keras neural network implementation
# Identical architecture with a near-zero (0.001) dropout rate, to observe
# how training behaves when regularization is almost switched off.
network1 = models.Sequential()
network1.add(layers.Dense(16, activation="relu",
                          input_shape=(X_train_balanced_trans_100.shape[1],)))
network1.add(layers.Dropout(0.001))
for units in (16, 16, 8):
    network1.add(layers.Dense(units, activation="relu"))
    network1.add(layers.Dropout(0.001))
network1.add(layers.Dense(1, activation="sigmoid"))
network1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
history_net1 = network1.fit(
    X_train_balanced_trans_100.tocsr(), y_train_balanced,
    epochs=100, batch_size=200, validation_split=0.5,
)
# Print each series' minimum loss with its epoch, then plot both curves.
hist = history_net1.history
epochs = list(range(1, len(hist["loss"]) + 1))
for key, caption in (("loss", "Min loss"), ("val_loss", "Min val loss")):
    best = min(hist[key])
    print(f"{caption}: {best} at epoch {hist[key].index(best) + 1}")
for key, colour in (("loss", "b"), ("val_loss", "r")):
    plt.plot(epochs, hist[key], marker="o", color=colour, label=key)
plt.legend(fontsize=20)
plt.xlabel("Number of epochs", fontsize=20)
plt.ylabel("Loss", fontsize=20)
plt.title("Monitoring neural network performance", fontsize=25)
plt.show()
# NOTE(review): this cell is an exact byte-for-byte duplicate of the
# summary/plot cell directly above — it re-prints the same minima and
# re-renders the same figure from the same history object. It looks like a
# copy-paste leftover from the notebook; consider removing it.
epochs = list(range(1,len(history_net1.history["loss"]) +1))
print("Min loss: " + str(min(history_net1.history["loss"])) + " at epoch " + str(history_net1.history["loss"].index(min(history_net1.history["loss"])) + 1))
print("Min val loss: " + str(min(history_net1.history["val_loss"]))+ " at epoch " + str(history_net1.history["val_loss"].index(min(history_net1.history["val_loss"])) + 1))
plt.plot(epochs,history_net1.history["loss"],marker = "o", color = "b", label = "loss")
plt.plot(epochs,history_net1.history["val_loss"],marker = "o", color = "r", label = "val_loss")
plt.legend(fontsize = 20)
plt.xlabel("Number of epochs",fontsize=20)
plt.ylabel("Loss",fontsize=20)
plt.title("Monitoring neural network performance",fontsize=25)
plt.show()
Now we have an intuition about the impact of dropout regularization on the existing network. We are familiar with the performance of the model, with and without regularization, so we can try increasing complexity to overfit once again.
# Increase model complexity to overfit
from keras import models, metrics, layers
import matplotlib.pyplot as plt
# Note that csr type of sparse matrix runs significantly faster in keras neural network implementation
# Widen the second hidden layer to 32 units while keeping dropout near zero
# (0.001), deliberately encouraging overfitting again.
network1 = models.Sequential()
network1.add(layers.Dense(16, activation="relu",
                          input_shape=(X_train_balanced_trans_100.shape[1],)))
network1.add(layers.Dropout(0.001))
for units in (32, 16, 8):
    network1.add(layers.Dense(units, activation="relu"))
    network1.add(layers.Dropout(0.001))
network1.add(layers.Dense(1, activation="sigmoid"))
network1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
history_net1 = network1.fit(
    X_train_balanced_trans_100.tocsr(), y_train_balanced,
    epochs=100, batch_size=200, validation_split=0.5,
)
# Summarize this run's minima and plot the loss curves per epoch.
hist = history_net1.history
epochs = list(range(1, len(hist["loss"]) + 1))
for key, caption in (("loss", "Min loss"), ("val_loss", "Min val loss")):
    best = min(hist[key])
    print(f"{caption}: {best} at epoch {hist[key].index(best) + 1}")
for key, colour in (("loss", "b"), ("val_loss", "r")):
    plt.plot(epochs, hist[key], marker="o", color=colour, label=key)
plt.legend(fontsize=20)
plt.xlabel("Number of epochs", fontsize=20)
plt.ylabel("Loss", fontsize=20)
plt.title("Monitoring neural network performance", fontsize=25)
plt.show()
Overfitting after epoch 6 is obvious. Now, let's apply our dropout regularization trick once again:
# Increase Dropout rate
from keras import models, metrics, layers
import matplotlib.pyplot as plt
# Note that csr type of sparse matrix runs significantly faster in keras neural network implementation
# Same widened 16/32/16/8 architecture, but with dropout raised to 0.1 to
# counteract the overfitting observed in the previous run.
network1 = models.Sequential()
network1.add(layers.Dense(16, activation="relu",
                          input_shape=(X_train_balanced_trans_100.shape[1],)))
network1.add(layers.Dropout(0.1))
for units in (32, 16, 8):
    network1.add(layers.Dense(units, activation="relu"))
    network1.add(layers.Dropout(0.1))
network1.add(layers.Dense(1, activation="sigmoid"))
network1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
history_net1 = network1.fit(
    X_train_balanced_trans_100.tocsr(), y_train_balanced,
    epochs=100, batch_size=200, validation_split=0.5,
)
# Report the best epoch for each loss series and plot the learning curves.
hist = history_net1.history
epochs = list(range(1, len(hist["loss"]) + 1))
for key, caption in (("loss", "Min loss"), ("val_loss", "Min val loss")):
    best = min(hist[key])
    print(f"{caption}: {best} at epoch {hist[key].index(best) + 1}")
for key, colour in (("loss", "b"), ("val_loss", "r")):
    plt.plot(epochs, hist[key], marker="o", color=colour, label=key)
plt.legend(fontsize=20)
plt.xlabel("Number of epochs", fontsize=20)
plt.ylabel("Loss", fontsize=20)
plt.title("Monitoring neural network performance", fontsize=25)
plt.show()
Notice the boost in network performance? Increasing regularization after observing an overfitted model gives us a model that predicts better than the benchmark model, while also avoiding the overfitting we observed.
Therefore, the take home message from this exercise is:
By performing a few empirical cycles involving these steps, you may be able to arrive at a better-tuned neural network and advance your Deep Learning journey!