当想要训练处理图像的深度学习模型时,遇到了大尺寸图像,如果我们直接使用全连接层来无脑叠加,那会造成以下几个问题:
(1)将图像展开为向量会丢失空间信息; (2)参数过多导致效率低下,训练困难; (3)大量的参数也很快会导致网络过拟合。
此时,使用CNN卷积神经网络,就能很好地解决上述问题。
今天,我将分享一下我自己使用pytorch搭建的CNN模型,识别由英文字母和数字组成的验证码图像。
阅读本文需提前掌握以下知识:
接下来我将开始介绍我是如何一步一步从0到1搭建这个模型并成功训练的。
# Image width and height in pixels
width: 150
height: 30
# Classification alphabet: every character a captcha may contain
alphabet: 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
# Number of characters in each captcha
numchar: 5
train:
  # Number of passes over the training set
  epoch: 100
  # Mini-batch size
  batch_size: 32
  # Learning rate
  lr: 0.001
  # Directory containing the training data
  train_data: ./data2/train
  # Directory containing the test data
  test_data: ./data2/test
  # Whether to train on the GPU
  is_gpu: True
  # Number of parallel data-loading worker processes
  num_workers: 3
  # Directory where trained model checkpoints are written
  out_model_path: ./model2
test:
  # Path of the trained model used for prediction
  model_path: ./model2/model_76_91%.path
  # Whether to run inference on the GPU
  is_gpu: False
  # Directory containing sample images
  samples_path: ./data2/samples
这些都是之后建模、训练、测试会用到的通用参数,为了避免在不同文件重复输入参数值,这里单独使用YAML统一管理这些配置参数。
该模型使用了尺寸为150*30的数据集,验证码长度为5。如果拿到了不同尺寸和验证码长度的数据集想要训练,只需要在这里修改即可。
这里我使用了4层卷积层+2层全连接层。
本来只使用了一层全连接层,但是准确度一直上不去,改为2层后,准确度能到91%
什么是flatten?
import torch.nn as nn
class CNN(nn.Module):
    """Four conv blocks + a two-layer fully-connected head for fixed-length
    captcha recognition.

    The network outputs a flat vector of length ``num_char * num_class``:
    one ``num_class``-way score group per character position.

    Args:
        num_class: size of the character alphabet.
        num_char: number of characters in each captcha.
        width: input image width in pixels (must be divisible by 16).
        height: input image height in pixels (must be divisible by 16).
    """

    def __init__(self, num_class=36, num_char=4, width=180, height=100):
        super(CNN, self).__init__()
        self.num_class = num_class
        self.num_char = num_char
        # Length of the flattened feature vector fed to the fully-connected head.
        # 512 is the channel count after the last conv block; width and height are
        # each halved four times because the image passes through four 2x2
        # MaxPool2d layers.
        self.line_size = int(512 * (width // 2 // 2 // 2 // 2) * (height // 2 // 2 // 2 // 2))
        self.conv1 = nn.Sequential(
            # Input is an RGB image, hence 3 channels; 16 filters -> 16 output channels.
            # padding (1, 1) adds one row/column on every side so the 3x3 convolution
            # keeps the spatial size unchanged.
            nn.Conv2d(3, 16, 3, padding=(1, 1)),
            # Max pooling: keep the maximum of every 2x2 patch (halves width and height).
            nn.MaxPool2d(2, 2),
            # Per-channel normalization to stabilize training.
            nn.BatchNorm2d(16),
            # Activation.
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 64, 3, padding=(1, 1)),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 512, 3, padding=(1, 1)),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(512, 512, 3, padding=(1, 1)),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )
        # Fully-connected head.
        # NOTE(review): there is no activation between the two Linear layers, so
        # they compose to a single affine map; inserting nn.ReLU() between them
        # would add capacity but invalidates existing trained checkpoints.
        self.fc = nn.Sequential(
            nn.Linear(self.line_size, self.line_size),
            # Output is one score per (character position, class) pair.
            nn.Linear(self.line_size, self.num_char * self.num_class)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        # Flatten per sample. Bug fix: the original used view(-1, self.line_size),
        # which can silently produce a wrong batch size when the spatial dimensions
        # do not match; flattening with an explicit batch dimension raises a clear
        # error instead.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
Dataset是pytorch提供的对数据进行读取和预处理的工具类,这里不进行过多介绍。
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
# Load an image from disk and normalize it to 3-channel RGB.
def img_loader(img_path):
    return Image.open(img_path).convert('RGB')
# Build (image_path, one-hot target) pairs from a directory of captcha images.
def make_dataset(data_path, alphabet, num_class, num_char):
    """Scan *data_path* and return a list of (image_path, target) samples.

    Each image file is expected to be named ``<label>_*.<ext>`` where the label
    is the captcha text. The target is the concatenation of ``num_char``
    one-hot vectors of length ``num_class``.

    Raises:
        ValueError: if a label has the wrong length or contains a character
            that is not in *alphabet*.
    """
    img_names = os.listdir(data_path)
    samples = []
    for img_name in img_names:
        # Full path of this image.
        img_path = os.path.join(data_path, img_name)
        # The label is the part of the file name before the first '_'
        # (and before the extension).
        target_str = img_name.replace("\\", '/').split('/')[-1].split('.')[0].split("_")[0]
        # Validate explicitly instead of with `assert`, which is stripped under -O.
        if len(target_str) != num_char:
            raise ValueError(
                "label '{}' from file '{}' has length {}, expected {}".format(
                    target_str, img_name, len(target_str), num_char))
        target = []
        # Build the one-hot target, num_char groups of num_class entries.
        for char in target_str:
            vec = [0] * num_class
            # Bug fix: the original used alphabet.find(char), which returns -1
            # for unknown characters and silently set vec[-1] = 1 (a wrong
            # label); str.index raises ValueError instead.
            vec[alphabet.index(char)] = 1
            target += vec
        samples.append((img_path, target))
    return samples
class CaptchaData(Dataset):
    """torch Dataset yielding (image, one-hot target) pairs for captcha training.

    Args:
        data_path: directory containing the captcha image files.
        num_class: size of the character alphabet.
        num_char: number of characters per captcha.
        transform: optional preprocessing applied to each image.
        target_transform: optional preprocessing applied to each target.
        alphabet: the characters a captcha may contain.
    """

    def __init__(self, data_path, num_class=62, num_char=5, transform=None, target_transform=None,
                 alphabet="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        # Bug fix: the original called super(Dataset, self).__init__(), which
        # starts the MRO search *after* Dataset and skips Dataset's own
        # initializer; super(CaptchaData, self) is the correct form.
        super(CaptchaData, self).__init__()
        self.data_path = data_path
        self.num_class = num_class
        self.num_char = num_char
        self.transform = transform                # image preprocessing (e.g. Resize + ToTensor)
        self.target_transform = target_transform  # optional target preprocessing
        self.alphabet = alphabet
        # Precompute the list of (image_path, one-hot target) samples.
        self.samples = make_dataset(self.data_path, self.alphabet,
                                    self.num_class, self.num_char)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        img_path, target = self.samples[index]
        img = img_loader(img_path)
        # Apply the optional preprocessing transforms.
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, torch.Tensor(target)
import logging
import torch
import torch.nn as nn
import yaml
from torch.autograd import Variable
from models import CNN
from datasets import CaptchaData
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, ToTensor, Resize
import time
import os
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s -[PID:%(process)s]-%(levelname)s-%(funcName)s-%(lineno)d: [ %(message)s ]',
                    datefmt="%Y-%m-%d %H:%M:%S")
# Load the shared training/test configuration from the YAML file.
with open('./config.yaml', 'r', encoding='utf-8') as f_config:
    config_result = f_config.read()
config = yaml.load(config_result, Loader=yaml.FullLoader)
batch_size = config["train"]["batch_size"]
base_lr = config["train"]["lr"]
max_epoch = config["train"]["epoch"]
# Directory where checkpoints are written after every epoch.
model_path = config["train"]["out_model_path"]
train_data_path = config["train"]["train_data"]
test_data_path = config["train"]["test_data"]
# Number of DataLoader worker processes.
num_workers = config["train"]["num_workers"]
# Whether to train on the GPU.
use_gpu = config["train"]["is_gpu"]
# Captcha image size in pixels.
width = config["width"]
height = config["height"]
# Character set and number of characters per captcha.
alphabet = config["alphabet"]
numchar = config["numchar"]
# Create the checkpoint directory on first run.
if not os.path.exists(model_path):
    logging.info("新建训练模型保存路径:{}".format(model_path))
    os.makedirs(model_path)
# Batch accuracy: a sample counts as correct only when every character matches.
def calculat_acc(output, target):
    # One row per character position; pick the highest-scoring class.
    output = output.view(-1, len(alphabet))
    target = target.view(-1, len(alphabet))
    output = torch.argmax(nn.functional.softmax(output, dim=1), dim=1)
    target = torch.argmax(target, dim=1)
    # Regroup per captcha: one row of numchar predicted indices per sample.
    output = output.view(-1, int(numchar))
    target = target.view(-1, int(numchar))
    correct_list = [1 if torch.equal(t, o) else 0 for t, o in zip(target, output)]
    return sum(correct_list) / len(correct_list)
def train():
    """Train the CNN on the captcha dataset, evaluating on the test set and
    saving a checkpoint after every epoch."""
    # Resize every image to the configured size and convert it to a tensor.
    transforms = Compose([Resize((height, width)), ToTensor()])
    # Training dataset and loader; drop_last keeps every batch the same shape.
    train_dataset = CaptchaData(train_data_path, num_class=len(alphabet), num_char=int(numchar),
                                transform=transforms, alphabet=alphabet)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,
                                   shuffle=True, drop_last=True)
    # Test dataset and loader.
    test_data = CaptchaData(test_data_path, num_class=len(alphabet), num_char=int(numchar),
                            transform=transforms, alphabet=alphabet)
    test_data_loader = DataLoader(test_data, batch_size=batch_size,
                                  num_workers=num_workers, shuffle=True, drop_last=True)
    # Build the model.
    cnn = CNN(num_class=len(alphabet), num_char=int(numchar), width=width, height=height)
    if use_gpu:
        cnn.cuda()
    # Adam optimizer.
    optimizer = torch.optim.Adam(cnn.parameters(), lr=base_lr)
    # Multi-label loss: the target is a concatenation of one-hot vectors.
    criterion = nn.MultiLabelSoftMarginLoss()
    for epoch in range(max_epoch):
        start_ = time.time()
        loss_history = []
        acc_history = []
        cnn.train()  # training mode: BatchNorm uses batch statistics
        for img, target in train_data_loader:
            if use_gpu:
                img = img.cuda()
                target = target.cuda()
            output = cnn(img)
            loss = criterion(output, target)
            # Standard backprop step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            acc_history.append(float(calculat_acc(output, target)))
            loss_history.append(float(loss))
        print('epoch:{},train_loss: {:.4}|train_acc: {:.4}'.format(
            epoch,
            torch.mean(torch.Tensor(loss_history)),
            torch.mean(torch.Tensor(acc_history)),
        ))
        loss_history = []
        acc_history = []
        cnn.eval()  # evaluation mode: BatchNorm uses running statistics
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for img, target in test_data_loader:
                # Bug fix: the original tested torch.cuda.is_available() here,
                # which moved the batch to the GPU even when use_gpu is False
                # and the model stayed on the CPU, crashing on CUDA machines.
                if use_gpu:
                    img = img.cuda()
                    target = target.cuda()
                output = cnn(img)
                # Bug fix: the original never appended to loss_history in this
                # loop, so the printed test_loss was the mean of an empty
                # tensor (nan); compute the actual test loss instead.
                loss_history.append(float(criterion(output, target)))
                acc_history.append(float(calculat_acc(output, target)))
        print('test_loss: {:.4}|test_acc: {:.4}'.format(
            torch.mean(torch.Tensor(loss_history)),
            torch.mean(torch.Tensor(acc_history)),
        ))
        print('epoch: {}|time: {:.4f}'.format(epoch, time.time() - start_))
        # Persist this epoch's weights.
        torch.save(cnn.state_dict(), os.path.join(model_path, "model_{}.path".format(epoch)))


if __name__ == "__main__":
    train()
import logging
import torch
import torch.nn as nn
import yaml
from PIL import Image
from models import CNN
from torchvision.transforms import Compose, ToTensor, Resize
import matplotlib.pyplot as plt
import os
import random
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s -[PID:%(process)s]-%(levelname)s-%(funcName)s-%(lineno)d: [ %(message)s ]',
                    datefmt="%Y-%m-%d %H:%M:%S")
# Load the shared configuration from the YAML file.
with open('./config.yaml', 'r', encoding='utf-8') as f_config:
    config_result = f_config.read()
config = yaml.load(config_result, Loader=yaml.FullLoader)
# Path of the trained model checkpoint.
model_path = config["test"]["model_path"]
# Bug fix: inference must honour the *test* GPU switch; the original read
# config["train"]["is_gpu"], so the training setting leaked into prediction.
use_gpu = config["test"]["is_gpu"]
# Captcha image size in pixels.
width = config["width"]
height = config["height"]
# Character set of the captcha.
alphabet = config["alphabet"]
# Number of characters per captcha.
numchar = config["numchar"]
# Directory containing sample images.
samples_path = config["test"]["samples_path"]
# Placeholder; load_net() replaces it with the configured, weight-loaded model.
model_net = CNN()
# Build the CNN from the configured hyper-parameters and load the trained
# weights into the module-level model_net.
def load_net():
    global model_net
    net = CNN(num_class=len(alphabet), num_char=int(numchar), width=width, height=height)
    if use_gpu:
        net = net.cuda()
        state = torch.load(model_path)
    else:
        # Remap GPU-saved tensors onto the CPU.
        state = torch.load(model_path, map_location='cpu')
    net.eval()  # inference mode: BatchNorm uses running statistics
    net.load_state_dict(state)
    model_net = net
# Predict the captcha text of a PIL image using the loaded model.
def predict_image(img):
    global model_net
    with torch.no_grad():
        rgb = img.convert('RGB')
        # Same preprocessing as during training: resize then convert to a tensor.
        preprocess = Compose([Resize((height, width)), ToTensor()])
        batch = preprocess(rgb).view(1, 3, height, width)
        if use_gpu:
            batch = batch.cuda()
        scores = model_net(batch)
        # One num_class-way softmax per character position; take the best class.
        probs = nn.functional.softmax(scores.view(-1, len(alphabet)), dim=1)
        indices = torch.argmax(probs, dim=1).view(-1, numchar)[0]
        return ''.join([alphabet[i] for i in indices.cpu().detach().numpy()])
if __name__ == "__main__":
load_net()
# 枚举数据所在文件夹
img_names = os.listdir(samples_path)
random.shuffle(img_names)
samples = []
for img_name in img_names:
# 拼接每个数据的路径
img_path = os.path.join(samples_path, img_name)
img = Image.open(img_path)
v_code = predict_image(img)
plt.figure()
plt.title("{}".format(v_code))
plt.imshow(img)
plt.show()
该模型使用了19000个训练集和1000个测试集,准确度最终达到了91%,并有20000个样例对训练好的模型进行试验。
验证码识别虽然是CNN中非常简单的应用,但是通过本次实践,能够基本掌握如何使用pytorch训练CNN模型,之后的CNN模型搭建,都可以参照上述思路。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。