# Made following the tutorials at https://pythonprogramming.net/data-deep-learning-neural-network-pytorch/
import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # First layer:
        # 28*28 input neurons because we will flatten each 28-by-28 image,
        # 64 output neurons as an arbitrary hidden-layer width.
        self.fc1 = nn.Linear(28*28, 64)
        # Two more hidden layers, 64 in - 64 out.
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        # Output layer:
        # 64 inputs from the previous layer,
        # 10 outputs - one for each label, the digits 0 through 9.
        self.fc4 = nn.Linear(64, 10)

    # Defines the feed-forward pass.
    def forward(self, x):
        # Apply the ReLU ( _/ ) activation to each hidden layer's output
        # and feed it into the next layer.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # The output layer: raw scores, no ReLU.
        x = self.fc4(x)
        # log_softmax turns the 10 scores into log-probabilities: their
        # exponentials sum to 1. Training pushes the probability mass
        # toward the correct digit, i.e. toward something like
        # [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] if the image is a drawing of 2.
        x = F.log_softmax(x, dim=1)
        return x


train = datasets.MNIST('', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST('', train=False, download=True,
                      transform=transforms.Compose([transforms.ToTensor()]))

trainset = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=10, shuffle=False)

net = Net()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# An epoch is one full pass through the training data.
EPOCHS = 3
for epoch in range(EPOCHS):
    for data in trainset:
        # data is a batch (of 10 in our case) of featuresets (inputs)
        # and labels (expected outputs).
        X, y = data
        # PyTorch accumulates gradients by default, so they must be
        # zeroed before each backward pass; otherwise each batch's
        # gradients would pile on top of the previous ones.
        net.zero_grad()
        # Feed forward.
        output = net(X.view(-1, 28*28))
        # nll_loss compares the log-probabilities against the correct
        # label; the implicit target is a one-hot vector (one 1, rest 0s).
        loss = F.nll_loss(output, y)
        # Backpropagate the loss to compute gradients...
        loss.backward()
        # ...which the optimizer uses to adjust the weights.
        optimizer.step()
    print("Epoch", epoch, "done")
print()

# Checking the accuracy on the test set. no_grad skips gradient
# bookkeeping, which we don't need for evaluation.
correct = 0
total = 0
with torch.no_grad():
    for data in testset:
        X, y = data
        output = net(X.view(-1, 28*28))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1
print("Accuracy:", round(correct/total, 3))
print()

# Testing on one test example.
for data in testset:
    X, y = data
    break

print('Input:')
for row in X[0][0]:
    for val in row:
        if val > 0.4:
            print('##', end='')
        else:
            print('  ', end='')
    print()

print('Prediction:')
print(torch.argmax(net(X[0].view(-1, 28*28))[0]))
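
# ---------------------------------------------------------------------
# A quick shape check (illustrative, not part of the original tutorial):
# each MNIST batch X has shape [batch, 1, 28, 28], and .view(-1, 28*28)
# flattens every image into a 784-long vector, which is why fc1 expects
# 28*28 inputs.
print(X.shape, '->', X.view(-1, 28*28).shape)
# e.g. torch.Size([10, 1, 28, 28]) -> torch.Size([10, 784])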
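
# ---------------------------------------------------------------------
# Why log_softmax + nll_loss? A minimal sanity-check sketch (the fake
# tensors below are assumptions for illustration): in PyTorch,
# F.cross_entropy on raw scores computes the same value as F.nll_loss
# on log_softmax output, so the pairing used above is the standard
# decomposition of cross-entropy.
logits = torch.randn(5, 10)          # fake raw scores for 5 images
labels = torch.randint(0, 10, (5,))  # fake target digits
a = F.nll_loss(F.log_softmax(logits, dim=1), labels)
b = F.cross_entropy(logits, labels)
print('nll_loss(log_softmax(x)) == cross_entropy(x):', torch.allclose(a, b))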
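
# ---------------------------------------------------------------------
# The accuracy loop above checks one prediction at a time for clarity.
# A batch-wise equivalent (a sketch, same counts) would replace the
# inner loop with:
#     correct += (output.argmax(dim=1) == y).sum().item()
#     total += y.size(0)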
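
# ---------------------------------------------------------------------
# Persisting the trained weights is a common next step (the filename
# 'mnist_net.pt' is an illustrative choice, not from the tutorial):
torch.save(net.state_dict(), 'mnist_net.pt')
reloaded = Net()
reloaded.load_state_dict(torch.load('mnist_net.pt'))
reloaded.eval()  # inference mode; a no-op here since Net has no dropout/batchnorm
print('Reloaded prediction:', torch.argmax(reloaded(X[0].view(-1, 28*28))[0]))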