In this notebook, we introduce LeNet, one of the first published CNNs to attract attention for its performance on computer vision tasks. The model was introduced by (and named for) Yann LeCun, then a researcher at AT&T Bell Labs, for the purpose of recognizing handwritten digits in images. This work represented the culmination of a decade of research developing the technology. In 1989, LeCun published the first study that successfully trained CNNs via backpropagation.
At the time, LeNet achieved outstanding results, matching the performance of support vector machines (SVMs), then a dominant approach in supervised learning. LeNet was eventually adapted to recognize digits for processing deposits in ATM machines. To this day, some ATMs still run the code that Yann LeCun and his colleague Leon Bottou wrote in the 1990s!
LeNet (LeNet-5) consists of two parts: (i) a convolutional encoder made up of two convolutional layers, and (ii) a dense block made up of three fully connected layers.
The architecture can be summarized as follows:
import torch
from torch import nn
from d2l import torch as d2l
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120), nn.Sigmoid(),
    nn.Linear(120, 84), nn.Sigmoid(),
    nn.Linear(84, 10))
We have slightly modified the original model, removing the Gaussian activation in the final layer. Apart from that, this network matches the original LeNet-5 architecture.
By passing a single-channel (black-and-white) $28 \times 28$ image through the network and printing the output shape at each layer, we can inspect the model:
X = torch.rand(size=(1, 1, 28, 28), dtype=torch.float32)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape: \t', X.shape)
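For reference, the loop above should report the following shapes (formatting aside). Note how the two convolution/pooling stages reduce the 28×28 input to 16 feature maps of size 5×5, which is where the 16 * 5 * 5 = 400 inputs of the first fully connected layer come from:

Conv2d output shape:     torch.Size([1, 6, 28, 28])
Sigmoid output shape:    torch.Size([1, 6, 28, 28])
AvgPool2d output shape:  torch.Size([1, 6, 14, 14])
Conv2d output shape:     torch.Size([1, 16, 10, 10])
Sigmoid output shape:    torch.Size([1, 16, 10, 10])
AvgPool2d output shape:  torch.Size([1, 16, 5, 5])
Flatten output shape:    torch.Size([1, 400])
Linear output shape:     torch.Size([1, 120])
Sigmoid output shape:    torch.Size([1, 120])
Linear output shape:     torch.Size([1, 84])
Sigmoid output shape:    torch.Size([1, 84])
Linear output shape:     torch.Size([1, 10])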
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
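As a quick sanity check (a small sketch not in the original notebook), you can peek at one batch from the iterator; d2l.load_data_fashion_mnist yields single-channel 28×28 images and integer labels:

X, y = next(iter(train_iter))
print(X.shape, y.shape)  # expected: torch.Size([256, 1, 28, 28]) torch.Size([256])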
def evaluate_accuracy_gpu(net, data_iter, device=None): #@save
    """Compute the accuracy for a model on a dataset using a GPU."""
    if isinstance(net, nn.Module):
        net.eval()  # Set the model to evaluation mode
        if not device:
            device = next(iter(net.parameters())).device
    # No. of correct predictions, no. of predictions
    metric = d2l.Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                # Required for BERT Fine-tuning (to be covered later)
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]
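The helper above relies on d2l.Accumulator, which simply keeps running sums over n quantities (here: the number of correct predictions and the total number of predictions). If you are not using the d2l package, a roughly equivalent stand-in would be the following sketch:

class Accumulator:
    """Keep running sums over n variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        # Add the new values element-wise to the stored sums
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def __getitem__(self, idx):
        return self.data[idx]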
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train a model with a GPU (defined in Chapter 6)."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
lr, num_epochs = 0.9, 10
train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
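The remainder of the notebook turns to AlexNet. As the code below makes explicit, it is a much deeper and wider network than LeNet: sigmoid activations are replaced by ReLU, average pooling by max-pooling, and dropout is added after the large fully connected layers.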
import torch
from torch import nn
from d2l import torch as d2l
net = nn.Sequential(
    # Here, we use a larger 11 x 11 window to capture objects. At the same
    # time, we use a stride of 4 to greatly reduce the height and width of the
    # output. Here, the number of output channels is much larger than that in
    # LeNet
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Make the convolution window smaller, set padding to 2 for consistent
    # height and width across the input and output, and increase the number of
    # output channels
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Use three successive convolutional layers and a smaller convolution
    # window. Except for the final convolutional layer, the number of output
    # channels is further increased. Pooling layers are not used to reduce the
    # height and width of input after the first two convolutional layers
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    # Here, the number of outputs of the fully-connected layer is several
    # times larger than that in LeNet. Use the dropout layer to mitigate
    # overfitting
    nn.Linear(6400, 4096), nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(),
    nn.Dropout(p=0.5),
    # Output layer. Since we are using Fashion-MNIST, the number of classes is
    # 10, instead of 1000 as in the paper
    nn.Linear(4096, 10))
# Note that AlexNet takes (224, 224) inputs instead of (28, 28)
X = torch.randn(1, 1, 224, 224)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
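For reference, with a (1, 1, 224, 224) input the layers should report the following shapes (ReLU and Dropout leave the shape unchanged and are omitted here). The final 256 × 5 × 5 feature map explains the 6400 inputs of the first fully connected layer:

Conv2d output shape:     torch.Size([1, 96, 54, 54])
MaxPool2d output shape:  torch.Size([1, 96, 26, 26])
Conv2d output shape:     torch.Size([1, 256, 26, 26])
MaxPool2d output shape:  torch.Size([1, 256, 12, 12])
Conv2d output shape:     torch.Size([1, 384, 12, 12])
Conv2d output shape:     torch.Size([1, 384, 12, 12])
Conv2d output shape:     torch.Size([1, 256, 12, 12])
MaxPool2d output shape:  torch.Size([1, 256, 5, 5])
Flatten output shape:    torch.Size([1, 6400])
Linear output shape:     torch.Size([1, 4096])
Linear output shape:     torch.Size([1, 4096])
Linear output shape:     torch.Size([1, 10])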
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
lr, num_epochs = 0.01, 10
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())