Implementation of layers in a neural network (part 1)
This first article covers the implementation of the Sequential module and of the Dense and Input layers for a multilayer NN using the NumPy library.
The article focuses on the practical implementation of the layers with a minimum of theory; it is assumed that the reader is familiar with the basics of training neural networks.
Let’s start by importing NumPy:
import numpy as np
Implementation of the Dense layer
class DenseLayer():
    def __init__(self, units=1, activation='relu', weights=np.array([]), b=np.array([])):
        self.units = units
        self.fl_init = True
        self.activation = activation
        self.weights = weights
        self.b_new = b
        self.w, self.b = np.array([]), np.array([])
I will explain the parameters (a small construction example follows the list):
- units – the number of neurons;
- activation – the activation function;
- weights and b_new – trained weights and biases that we will be able to pass into the layer later;
- w, b – the initial weights and biases;
- fl_init – a flag indicating whether the initial weights and biases have already been created.
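For example, this is how a layer with four neurons and a sigmoid activation could be created before any training (a small illustrative sketch, not part of the article's final example):

layer = DenseLayer(units=4, activation='sigmoid')
print(layer.units, layer.activation)   # 4 sigmoid
print(layer.w.size, layer.b.size)      # 0 0 – the real weights are only created on the first call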
Next, we use the magic __call__ method to turn our class into a functor (a callable object):
    def __call__(self, x):
        if self.fl_init and (self.weights.shape[0] == 0):
            # first call and no trained weights were passed: create the initial ones
            self.w = np.random.normal(loc=0.0, scale=1.0, size=(x.shape[-1], self.units)) / np.sqrt(2.0 / x.shape[-1])
            self.b = np.ones(shape=(self.units,), dtype=np.float32)
            self.fl_init = False
        elif self.weights.shape[0] != 0:
            # trained weights and biases were passed in: use them instead
            self.weights = self.weights.reshape((x.shape[-1], self.units))
            self.w = self.weights
            self.b_new = self.b_new.reshape((self.units,))
            self.b = self.b_new
            self.fl_init = False

        y = x.dot(self.w) + self.b

        if self.activation == 'relu':
            return np.maximum(0, y), self.w, self.b, 1, self.units, self.activation
        if self.activation == 'Leaky_relu':
            return np.maximum(0.01 * y, y), self.w, self.b, 1, self.units, self.activation
        if self.activation == 'softmax':
            return np.exp(y) / np.sum(np.exp(y), axis=-1, keepdims=True), self.w, self.b, 1, self.units, self.activation
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-y)), self.w, self.b, 1, self.units, self.activation
        if self.activation == 'tanh':
            return (np.exp(2 * y) - 1) / (np.exp(2 * y) + 1), self.w, self.b, 1, self.units, self.activation
        if self.activation == 'linear':
            return y, self.w, self.b, 1, self.units, self.activation
The principle of operation is as follows (a quick check is shown after the list):
- First, we verify that the initial weights have not yet been created and that no trained weights were passed in.
- If so, the initial weights are drawn from a normal distribution (mean 0, variance 1) and scaled by the number of inputs, the initial biases are set to ones, and the flag is set to False.
- If trained weights were passed in, they (and the corresponding biases) replace the initial ones.
- We compute y = x·w + b.
- We pass y through the specified activation function (the default is 'relu').
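As a quick sanity check (a hedged sketch on random data, not part of the article's final example), the first element of the returned tuple is the layer activation with shape (batch, units):

x_check = np.random.normal(size=(4, 2))                # a batch of 4 samples with 2 features
layer = DenseLayer(units=3, activation='relu')
out, w, b, layer_flag, units, activation = layer(x_check)
print(out.shape)                                       # (4, 3) – one row per sample, one column per neuron
print(w.shape, b.shape)                                # (2, 3) (3,)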
We implement a simple Input class:
class Input():
    def __init__(self, shape=None):
        self.shape = shape

    def __call__(self, x):
        if self.shape is not None:
            if x.shape != self.shape:
                return x.reshape(self.shape), 0
            else:
                return x, 0
        return x, 0
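A short check on toy data (illustration only) shows that Input simply passes the data through, optionally reshaping it, and returns 0 as its second element, which later lets Sequential distinguish it from a Dense layer (which returns 1 in its tuple):

inp = Input()
x_in = np.array([[3., 2.], [2., 2.]])
out, flag = inp(x_in)
print(out.shape, flag)   # (2, 2) 0 – the data is returned unchanged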
Let’s move on to writing the Sequential module:
class Sequential():
    def __init__(self, layers):
        self.layers = layers  # the layers of the NN
Let’s start with the fit method.
First, we implement an auxiliary predict function that returns, for every layer, the outputs, activation functions, weights and biases, a layer-type flag, and the number of neurons. We will need all of this later for the error backpropagation (BP) step.
def predict(x):
    activations = []
    predict_for_layers = []
    weights = []
    b_coef = []
    layer_2 = []
    units = []
    predict = self.layers[0](x)
    layer_2.append(predict[1])
    predict_for_layers.append(predict[0])
    for i in range(1, len(self.layers)):
        predict = self.layers[i](predict[0])
        activations.append(predict[-1])
        predict_for_layers.append(predict[0])
        weights.append(predict[1])
        b_coef.append(predict[2])
        layer_2.append(predict[3])
        units.append(predict[4])
    return predict_for_layers, activations, weights, b_coef, layer_2, units
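To visualize what this auxiliary function collects, here is a hand-unrolled sketch for a hypothetical Input -> DenseLayer(3) -> DenseLayer(1) model (illustration only, not part of the class):

inp = Input()
d1 = DenseLayer(units=3, activation='relu')
d2 = DenseLayer(units=1, activation='linear')

x_demo = np.array([[1.0, 2.0]])
out0 = inp(x_demo)     # (x, 0)
out1 = d1(out0[0])     # (activation, w, b, 1, units, activation name)
out2 = d2(out1[0])

predict_for_layers = [out0[0], out1[0], out2[0]]   # outputs of every layer
weights = [out1[1], out2[1]]                       # weight matrices of the Dense layers
b_coef = [out1[2], out2[2]]                        # bias vectors
activations = [out1[-1], out2[-1]]                 # ['relu', 'linear']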
Next, we implement the gradient calculation functions:
# The gradients are expressed through the layer outputs,
# since backpropagation below works with the stored activations.
def sigmoid_gradient(output):
    return output * (1 - output)

def tanh_gradient(out):
    return 1 - out ** 2

def relu_gradient(x):
    return (x > 0) * 1

def leaky_relu_gradient(x):
    return (x > 0) * 1 + (x <= 0) * 0.01

def linear_gradient(x):
    return 1
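Since backpropagation multiplies the layer error by these values, it is worth checking that, for example, sigmoid_gradient really is the derivative of the sigmoid when applied to its output. A minimal numerical check on toy values (not part of the class):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

z = 0.7
eps = 1e-6
numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)   # finite-difference derivative
analytic = sigmoid(z) * (1 - sigmoid(z))                      # same formula as sigmoid_gradient(output)
print(round(numeric, 4), round(analytic, 4))                  # 0.2217 0.2217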
Let’s move on to the backpropagation itself:
list_back = self.layers[::-1]
for elem in range(x_input.shape[0]):
    x, y = x_input[elem].reshape(1, -1), y_input[elem]
    for epoch in range(epochs):
        # predict returns: outputs, activations, w, b, layer type, number of neurons
        predict_layers = predict(x)
        predict_for_layers, activations, weights, b_coef, layers = predict_layers[0][::-1], predict_layers[1][::-1], predict_layers[2][::-1], predict_layers[3][::-1], predict_layers[4]
        units = predict_layers[5]
        layer_error = predict_for_layers[0] - y
        if len(layer_error.shape) == 1:
            layer_error = layer_error.reshape(1, -1)
        for ind in range(len(list_back) - 1):
            delta_weights = 0
            if activations[ind] == 'linear':
                delta_weights = layer_error * linear_gradient(predict_for_layers[ind])
            if activations[ind] == 'Leaky_relu':
                delta_weights = layer_error * leaky_relu_gradient(predict_for_layers[ind])
            if activations[ind] == 'relu':
                delta_weights = layer_error * relu_gradient(predict_for_layers[ind])
            if activations[ind] == 'sigmoid':
                delta_weights = layer_error * sigmoid_gradient(predict_for_layers[ind])
            if activations[ind] == 'tanh':
                delta_weights = layer_error * tanh_gradient(predict_for_layers[ind])
            b_coef[ind] -= alpha * np.full(b_coef[ind].shape, layer_error.sum())
            layer_error = delta_weights.dot(np.transpose(weights[ind]))
            weights[ind] -= alpha * np.transpose(predict_for_layers[ind + 1]).dot(delta_weights)
        weights_inp = weights[::-1]
        b_inp = b_coef[::-1]
        activations_inp = activations[::-1]
        for indx in range(1, len(self.layers)):
            if layers[indx] == 1:
                self.layers[indx] = DenseLayer(units=units[indx - 1], weights=weights_inp[indx - 1], b=b_inp[indx - 1], activation=activations_inp[indx - 1])
I will describe the principle of operation:
- A pair of elements is taken – a label and an input sample.
- Then, in a loop over the number of epochs:
- Using the previously written predict function, the outputs are computed (the lists are reversed so that we start from the last layer).
- We compute the error on the last layer.
- delta_weights is the local gradient – the layer error multiplied by the derivative of the activation function. Where the slope of the tangent line (the value of the derivative) is small, i.e. the layer is very confident, the correction is small.
- Then layer_error is propagated back to the previous layer.
- We update the weights according to the rule weights[ind] -= alpha * a_prev.T.dot(delta_weights), and the biases according to b_coef[ind] -= alpha * layer_error.sum(), exactly as in the code above.
- Finally, we go through the layers and recreate them with the new weights and biases (a toy example of a single update step is shown below).
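To make the update rule concrete, here is a minimal, self-contained sketch of one update step for a single layer with a linear activation (toy numbers, hypothetical setup, not part of the class):

import numpy as np

alpha = 0.1
a_prev = np.array([[1.0, 2.0]])   # output of the previous layer, shape (1, 2)
w = np.array([[0.5], [0.5]])      # weights of the current layer, shape (2, 1)
b = np.array([0.0])               # bias of the current layer
y_true = np.array([[3.0]])

a = a_prev.dot(w) + b             # forward pass: [[1.5]]
layer_error = a - y_true          # error on this layer: [[-1.5]]
delta_weights = layer_error * 1   # linear activation, so the derivative is 1

b -= alpha * np.full(b.shape, layer_error.sum())   # b becomes [0.15]
w -= alpha * a_prev.T.dot(delta_weights)           # w becomes [[0.65], [0.8]]
print(a_prev.dot(w) + b)                           # [[2.4]] – closer to the target 3.0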
It remains to write a predict method without the auxiliary outputs:
def predict(self, x):
    predict = self.layers[0](x)
    for i in range(1, len(self.layers)):
        predict = self.layers[i](predict[0])
    return predict
Here is the complete code of the Sequential class:
class Sequential():
    def __init__(self, layers):
        self.layers = layers  # the layers of the NN

    def fit(self, x_input, y_input, epochs=50, alpha=0.01):
        # Auxiliary forward pass: collects outputs, activations, weights,
        # biases, layer-type flags and neuron counts for every layer.
        def predict(x):
            activations = []
            predict_for_layers = []
            weights = []
            b_coef = []
            layer_2 = []
            units = []
            predict = self.layers[0](x)
            layer_2.append(predict[1])
            predict_for_layers.append(predict[0])
            for i in range(1, len(self.layers)):
                predict = self.layers[i](predict[0])
                activations.append(predict[-1])
                predict_for_layers.append(predict[0])
                weights.append(predict[1])
                b_coef.append(predict[2])
                layer_2.append(predict[3])
                units.append(predict[4])
            return predict_for_layers, activations, weights, b_coef, layer_2, units

        # Activation gradients, expressed through the layer outputs.
        def sigmoid_gradient(output):
            return output * (1 - output)

        def tanh_gradient(out):
            return 1 - out ** 2

        def relu_gradient(x):
            return (x > 0) * 1

        def leaky_relu_gradient(x):
            return (x > 0) * 1 + (x <= 0) * 0.01

        def linear_gradient(x):
            return 1

        list_back = self.layers[::-1]
        for elem in range(x_input.shape[0]):
            x, y = x_input[elem].reshape(1, -1), y_input[elem]
            for epoch in range(epochs):
                # predict returns: outputs, activations, w, b, layer type, number of neurons
                predict_layers = predict(x)
                predict_for_layers, activations, weights, b_coef, layers = predict_layers[0][::-1], predict_layers[1][::-1], predict_layers[2][::-1], predict_layers[3][::-1], predict_layers[4]
                units = predict_layers[5]
                layer_error = predict_for_layers[0] - y
                if len(layer_error.shape) == 1:
                    layer_error = layer_error.reshape(1, -1)
                for ind in range(len(list_back) - 1):
                    delta_weights = 0
                    if activations[ind] == 'linear':
                        delta_weights = layer_error * linear_gradient(predict_for_layers[ind])
                    if activations[ind] == 'Leaky_relu':
                        delta_weights = layer_error * leaky_relu_gradient(predict_for_layers[ind])
                    if activations[ind] == 'relu':
                        delta_weights = layer_error * relu_gradient(predict_for_layers[ind])
                    if activations[ind] == 'sigmoid':
                        delta_weights = layer_error * sigmoid_gradient(predict_for_layers[ind])
                    if activations[ind] == 'tanh':
                        delta_weights = layer_error * tanh_gradient(predict_for_layers[ind])
                    b_coef[ind] -= alpha * np.full(b_coef[ind].shape, layer_error.sum())
                    layer_error = delta_weights.dot(np.transpose(weights[ind]))
                    weights[ind] -= alpha * np.transpose(predict_for_layers[ind + 1]).dot(delta_weights)
                weights_inp = weights[::-1]
                b_inp = b_coef[::-1]
                activations_inp = activations[::-1]
                for indx in range(1, len(self.layers)):
                    if layers[indx] == 1:
                        self.layers[indx] = DenseLayer(units=units[indx - 1], weights=weights_inp[indx - 1], b=b_inp[indx - 1], activation=activations_inp[indx - 1])

    # Prediction without the auxiliary outputs
    def predict(self, x):
        predict = self.layers[0](x)
        for i in range(1, len(self.layers)):
            predict = self.layers[i](predict[0])
        return predict
Here is a simple example of use:
model = Sequential([
    Input(),
    DenseLayer(units=3, activation='relu'),
    DenseLayer(units=1, activation='relu')
])

x = np.array([[3., 2.], [2., 2.], [3., 3.], [4., 4.]])
y = [5, 4, 6, 8]

model.fit(x, y, epochs=40)

x_test = np.array([[3., 4.], [4., 4.]])
print(model.predict(x_test)[0])
Output:
[[7.13999622]
[8. ]]
Conclusions
As you can see, after training the neural network gives an almost correct answer on an example it did not encounter during training.
There are many flaws in this implementation, so if you have comments on how to fix them, please write.