Deploy Data Science

All things Data Science and ML Ops

27 May 2021

Intro to PyTorch for Deep Learning

PyTorch

PyTorch is an open-source machine learning framework that accelerates the path from research prototyping to production deployment.

Regression Problem Example

Boston Housing Dataset

I have used the Boston housing dataset, as it is one of the most basic and beginner-friendly datasets on Kaggle.

Python Implementation

The implementation creates a simple neural network using PyTorch and compares it with a baseline linear regression model.

Learnings

Some key lessons learnt are to start with a simple dataset and to compare against a baseline model, so the benefit of a non-linear model is visible. The data has to be preprocessed into tensors, chunked into batches, and then passed to training.

Caveats and Gotchas

The loss function in PyTorch requires the target Y (the dependent variable) to be a 2-dimensional tensor. E.g., for a tensor of 100 values, the shape should be (100, 1).

import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.data as Data
from torch import tensor
from torch.autograd import Variable

from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
torch.manual_seed(1)    # reproducible weight init and data shuffling

datadir = './data/'
# get data from kaggle (Boston housing competition)
df = pd.read_csv(f'{datadir}/boston_train.csv')

# 'medv' (median home value) is the regression target; the remaining
# 13 numeric columns are the features.
y = df['medv'].to_numpy()
x = df[['crim', 'zn', 'indus', 'chas', 'nox', 'age', 'dis', 'rad', 'tax',
        'ptratio', 'black', 'rm', 'lstat']].to_numpy()

# scaling may be enabled, not necessary though
# scaler = MinMaxScaler()
# x = scaler.fit_transform(x.astype(np.float32))

# Convert to float32 tensors, the dtype torch layers expect.
# NOTE: the original also wrapped these in Variable(); Variable has been
# a no-op alias for Tensor since PyTorch 0.4, so plain tensors train fine.
x, y = tensor(x.astype(np.float32)), tensor(y.astype(np.float32))

# another way to define a network
# Linear layers do a linear transformation i.e matrix multiplication
# with a constant (bias)
# LeakyReLU is an activation which does a non-linear transformation
# Input here 13 is number of features in data; output is a single
# predicted house value per row.
net = torch.nn.Sequential(
    torch.nn.Linear(13, 200),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(200, 100),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(100, 1),
)

LR = 0.001  # learning rate for Adam

optimizer = torch.optim.Adam(net.parameters(), lr=LR)
# this is for regression mean squared loss
# BUG FIX: the original passed reduce='mean'; `reduce` is a deprecated
# boolean flag, so the string was merely truthy. The supported keyword
# for selecting mean reduction is `reduction`.
loss_func = torch.nn.MSELoss(reduction='mean')

BATCH_SIZE = 50
EPOCH = 300

# Pair features with targets so the DataLoader can serve shuffled
# mini-batches of BATCH_SIZE rows during training.
torch_dataset = Data.TensorDataset(x, y)

loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)


def get_mse(model, features=None, targets=None):
    """Return the mean squared error of ``model`` on the given data.

    Args:
        model: callable mapping an (N, 13) float tensor to (N, 1) predictions.
        features: input tensor; defaults to the module-level training ``x``.
        targets: ground-truth tensor; defaults to the module-level ``y``.

    Returns:
        float: mean squared error between targets and predictions.
    """
    if features is None:
        features = x
    if targets is None:
        targets = y
    # BUG FIX: the original ignored ``model`` and always evaluated the
    # global ``net``; use the argument so any model can be scored.
    prediction = model(features)
    # Flatten both sides before differencing so an (N, 1) prediction and an
    # (N,) target don't broadcast to an (N, N) matrix. Equivalent to
    # sklearn.metrics.mean_squared_error with default settings.
    diff = targets.data.numpy().reshape(-1) - prediction.data.numpy().reshape(-1)
    return float(np.mean(diff ** 2))


epoch_losses = []        # mean mini-batch loss per epoch
training_accuracy = []   # full-dataset MSE per epoch (name kept for plots)
# start training
for epoch in range(EPOCH):
    losses = []
    # one pass over the DataLoader = ceil(total rows / batch size) steps
    for batch_x, batch_y in loader:
        b_x = Variable(batch_x)
        b_y = Variable(batch_y)

        prediction = net(b_x)     # input x and predict based on x

        # must be (1. nn output, 2. target)
        # the target must be a 2-dimensional (batch, 1) tensor to match
        # the network output, otherwise the model predicts same values
        loss = loss_func(prediction, torch.unsqueeze(b_y, dim=1))

        optimizer.zero_grad()   # clear gradients for next train
        loss.backward()         # backpropagation, compute gradients
        optimizer.step()        # apply gradients

        # save the current training information
        losses.append(float(loss))

    # BUG FIX: the original divided by `step` (last batch index,
    # i.e. num_batches - 1); divide by the actual batch count.
    epoch_losses.append(sum(losses) / len(losses))
    training_accuracy.append(get_mse(net))

    print(
        f"Epoch #{epoch+1}\tLoss: {epoch_losses[-1]:.3f}\t MSE: {training_accuracy[-1]}")


# plotting: one x value per completed epoch
iters = list(range(EPOCH))

# Curve 1: mean mini-batch loss per epoch
plt.plot(iters, epoch_losses)
plt.title(f"Training Curve (batch_size={BATCH_SIZE}, lr={LR})")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.show()

# Curve 2: full-dataset MSE per epoch
plt.plot(iters, training_accuracy)
plt.title(f"Training Error (batch_size={BATCH_SIZE}, lr={LR})")
plt.xlabel("Iterations")
plt.ylabel("Training Accuracy")
plt.show()


# Final MSE of the trained network over the whole training set
# (bare expression kept notebook-style, as in the original).
prediction = net(x)     # input x and predict based on x
mean_squared_error(y.data.numpy(),  prediction.data.numpy())


# Compare with Linear regression baseline: an ordinary least-squares fit
# on the same features shows how much the non-linear net buys us.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(x, y)
mean_squared_error(y, lm.predict(x))