01. CNN MNIST
01. CNN MNIST
CNN and PyTorch
1. Overall Pipeline of Model Implementation
A typical PyTorch workflow:
1
Dataset → DataLoader → Model → Loss → Optimizer → Train → Evaluate
1-1. Device Setup
1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- Use GPU if available, otherwise CPU
1
2
print(torch.cuda.is_available())
print(torch.cuda.device_count())
2. Dataset
1
2
3
4
# ToTensor converts the image to a float tensor in [0, 1]; Normalize then
# maps the single grayscale channel to [-1, 1] via (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    # One-element tuples: one mean / std value per channel.
    transforms.Normalize((0.5,), (0.5,)),
])
| Step | Description |
|---|---|
| ToTensor | Converts image to tensor (0~1) |
| Normalize | Stabilizes training |
Normalization is critical for CNN convergence
1
2
3
4
from torchvision import datasets, transforms
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)
2-1. DataLoader
1
2
3
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
| Parameter | Meaning |
|---|---|
| batch_size | samples per iteration |
| shuffle | improves generalization |
| num_workers | parallel data loading |
- Larger batch → better GPU utilization
- Increase num_workers → reduce the data-loading (I/O) bottleneck
3. CNN
1
2
3
4
5
6
7
8
9
10
11
12
class CNN(nn.Module):
    """Small convolutional classifier for 1x28x28 MNIST digits."""

    def __init__(self):
        super().__init__()
        # Two unpadded 3x3 convolutions: 28x28 -> 26x26 -> 24x24.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        # 2x2 max pooling with stride 2 halves the spatial size: 24x24 -> 12x12.
        self.pool = nn.MaxPool2d(2, 2)
        # Flattened pooled features (64 channels * 12 * 12) -> 128 -> 10 classes.
        self.fc1 = nn.Linear(64 * 12 * 12, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
3-1. Convolution Layers
1
2
3
nn.Conv2d(in_channels, out_channels, kernel_size,
          stride=1, padding=0, dilation=1, groups=1,
          bias=True, padding_mode='zeros')
in_channels / out_channels
1
nn.Conv2d(1, 32, 3)
- in_channels: input feature channels (e.g., grayscale=1, RGB=3)
- out_channels: number of filters (feature maps)
More channels = more expressive power, but higher compute/memory
kernel_size
1
kernel_size=3
- Size of the convolution filter (e.g., 3x3, 5x5)
stride
1
stride=2
- Controls how far the kernel moves
| stride | effect |
|---|---|
| 1 | detailed features |
| 2 | downsampling |
- Often replaces pooling in modern architectures
padding
1
padding=1
- Adds zeros around input
dilation
1
dilation=2
- Expands kernel spacing
Example:
1
3x3 kernel → behaves like larger receptive field
- semantic segmentation
- context-aware models
groups
1
groups=1
Controls channel connectivity
| groups | behavior |
|---|---|
| 1 | normal convolution |
| in_channels | depthwise convolution |
- MobileNet
- efficient models
bias
1
bias=True
- Adds bias term (ax + b, b is bias)
Often disabled when using BatchNorm:
1
bias=False
Output Size Formula
Understanding output shape is critical.
1
Output = floor((W - K + 2P) / S) + 1
- W = input size
- K = kernel size
- P = padding
- S = stride
3-2. Pooling
1
self.pool = nn.MaxPool2d(2,2)
- Downsampling
- Reduces computation
- Adds translation invariance
3-3. Fully Connected
1
self.fc1 = nn.Linear(64*12*12, 128)
Shape Flow
1
2
3
4
5
28x28
→ conv1 → 26x26
→ conv2 → 24x24
→ pool → 12x12
→ flatten → 64 * 12 * 12
3-4. Activation
1
self.relu = nn.ReLU()
- Adds non-linearity
4. Forward Pass
1
2
3
4
5
6
7
8
def forward(self, x):
    """Map a batch of images to one score per class.

    Pipeline: Conv -> ReLU -> Conv -> ReLU -> Pool -> Flatten -> FC -> ReLU -> FC.
    """
    x = self.relu(self.conv1(x))
    x = self.relu(self.conv2(x))
    x = self.pool(x)
    # Collapse every non-batch dimension so the linear layers receive one
    # feature vector per sample.
    x = x.view(x.size(0), -1)
    x = self.relu(self.fc1(x))
    # No softmax is applied here; the raw scores are returned.
    x = self.fc2(x)
    return x
1
Conv → ReLU → Conv → ReLU → Pool → Flatten → FC → ReLU → FC → Output (logits)
5. Loss & Optimizer
1
2
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
- CrossEntropyLoss = LogSoftmax + NLLLoss (it expects raw logits, so the model has no final softmax layer)
| Optimizer | Use Case |
|---|---|
| SGD | stable |
| Adam | fast |
| AdamW | better generalization |
6. Training Loop
1
2
3
4
5
6
for images, labels in train_loader:
    optimizer.zero_grad()              # reset gradients left from the previous step
    outputs = model(images)            # forward pass
    loss = criterion(outputs, labels)  # compute the loss
    loss.backward()                    # backpropagation: only computes gradients
    optimizer.step()                   # update the weights
1
forward → loss → backward → update
Gradient accumulation: update the weights once after several backward passes
1
2
3
4
5
6
7
8
9
10
11
12
accumulation_steps = 4  # number of mini-batches accumulated per weight update

optimizer.zero_grad()
for i, (x, y) in enumerate(loader):
    output = model(x)
    # Scale each loss so the accumulated gradient is the average over the
    # accumulated mini-batches (for a mean-reduced criterion) rather than
    # accumulation_steps times larger.
    loss = criterion(output, y) / accumulation_steps
    loss.backward()  # gradients add up in .grad until zero_grad() is called
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
Update weights after gradient clipping
1
2
3
4
5
loss.backward()
# Rescale the gradients in place so that their combined norm does not exceed
# max_norm before the optimizer uses them.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
Update weights with a combined multi-loss objective
1
2
3
4
5
6
loss1 = criterion1(...)
loss2 = criterion2(...)
# Backpropagate the sum in a single pass so both objectives contribute
# gradients to the same optimizer step.
(loss1 + loss2).backward()
optimizer.step()
Update weights when fine-tuning
1
2
3
4
5
6
7
loss.backward()
# Discard the gradient of every parameter whose name contains "backbone".
# The optimizer skips parameters whose .grad is None, so only the remaining
# parameters are updated by this step.
for name, param in model.named_parameters():
    if "backbone" in name:
        param.grad = None
optimizer.step()
7. Evaluation
1
model.eval()
- switches to evaluation mode: dropout is disabled and batchnorm uses its running statistics instead of batch statistics
1
with torch.no_grad():
- disables gradient calculation
1
_, predicted = torch.max(outputs, 1)
- select predicted class
8. Accuracy
1
correct += (predicted == labels).sum().item()
9. Whole Code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch
# Report which CUDA hardware PyTorch can see.
print("cuda available:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("device name:", torch.cuda.get_device_name(0))

# ToTensor converts the image to a float tensor in [0, 1]; Normalize then
# maps the single grayscale channel to [-1, 1] via (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
# train=False selects the held-out test split, so the reported accuracy is
# not measured on the samples the model was trained on.
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
# Evaluation order does not matter, so the test loader is not shuffled.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
class CNN(nn.Module):
    """Small convolutional classifier for 1x28x28 MNIST digits.

    Shape flow for one image: 1x28x28 -> conv1 -> 32x26x26 -> conv2 ->
    64x24x24 -> pool -> 64x12x12 -> flatten -> 9216 -> fc1 -> 128 -> fc2 -> 10.
    """

    def __init__(self):
        super().__init__()
        # Unpadded 3x3 convolutions each shrink the spatial size by 2.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        # 2x2 max pooling with stride 2 halves height and width.
        self.pool = nn.MaxPool2d(2, 2)
        # 64 channels * 12 * 12 pooled positions feed the first linear layer.
        self.fc1 = nn.Linear(64 * 12 * 12, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one raw score per class for each image in the batch."""
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        # Collapse every non-batch dimension into one feature vector.
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        # No softmax is applied here; the raw scores are returned.
        x = self.fc2(x)
        return x
model = CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0.0  # sum of the per-batch losses over this epoch
    for images, labels in train_loader:
        # Move the batch to the same device as the model.
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # ".4f" prints four decimal places; a bare "4f" would only set a
    # minimum field width.
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}")
model.eval()  # switch the model to evaluation mode
correct = 0   # number of correctly classified samples
total = 0     # number of samples seen
with torch.no_grad():  # gradients are not needed for inference
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # torch.max over dim 1 returns (values, indices); the index of the
        # largest score is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f"Accuracy: {100 * correct / total:.2f}%")
import matplotlib.pyplot as plt
# Fetch a single (image, label) sample from the training set
image, label = train_dataset[1]
# tensor → numpy; squeeze() drops size-1 dimensions so imshow gets a 2-D array
img = image.numpy().squeeze()
plt.imshow(img, cmap='gray')
plt.title(f"Label: {label}")
plt.show()
This post is licensed under CC BY 4.0 by the author.