Activation functions used between layers
import torch
import torch.nn as nn

a = torch.randn(3, 3)
a, nn.ReLU()(a)  # ReLU zeroes out the negative entries
import matplotlib.pyplot as plt

x = torch.arange(-5, 3, 0.05)
y = nn.ReLU(inplace=False)(x)
plt.plot(x, y);
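As a quick sanity check (my addition, not part of the original notebook), nn.ReLU should match elementwise clamping at zero:
x = torch.arange(-5, 3, 0.05)
assert torch.allclose(nn.ReLU()(x), torch.clamp(x, min=0))  # ReLU(x) = max(0, x)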
a = torch.randn(3, 3)
a, nn.LeakyReLU()(a)  # default negative_slope=0.01
x = torch.arange(-5, 3, 0.05)
y = nn.LeakyReLU(negative_slope=0.5, inplace=False)(x)
plt.plot(x, y);
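A minimal sketch of the definition (my own check): with negative_slope=0.5, LeakyReLU should return x for x >= 0 and 0.5 * x otherwise:
x = torch.arange(-5, 3, 0.05)
manual = torch.where(x >= 0, x, 0.5 * x)  # x if x >= 0 else negative_slope * x
assert torch.allclose(nn.LeakyReLU(negative_slope=0.5)(x), manual)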
x = torch.arange(-5, 3, 0.05)
y = nn.Mish()(x)
plt.plot(x, y);
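For reference, Mish is defined as x * tanh(softplus(x)); a quick check against the functional pieces (this snippet and the F alias are my addition):
import torch.nn.functional as F
x = torch.arange(-5, 3, 0.05)
manual = x * torch.tanh(F.softplus(x))  # Mish(x) = x * tanh(ln(1 + e^x))
assert torch.allclose(nn.Mish()(x), manual, atol=1e-6)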
nn.Sigmoid()(torch.randn((3, 1)))
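Sigmoid squashes values into (0, 1); a minimal sketch (not from the original) checking it against 1 / (1 + exp(-x)):
z = torch.randn((3, 1))
assert torch.allclose(nn.Sigmoid()(z), 1 / (1 + torch.exp(-z)))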
y = torch.randn((2, 3))
p = nn.Softmax(dim=-1)(y)
print(p)
print(p.sum(dim=-1, keepdim=True))  # each row sums to 1
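As a quick sketch of what Softmax computes (my own check, reusing y and p from above): each row is exponentiated and then normalized by the row sum, which is why the rows sum to 1:
manual = torch.exp(y) / torch.exp(y).sum(dim=-1, keepdim=True)
assert torch.allclose(p, manual)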