seq2seq

1 RNN案例-seq2seq英译法

1.1 seq2seq模型介绍

模型结构
- 编码器 encoder
- 解码器 decoder
- 编码器和解码器中可以使用RNN模型或者是transformer模型
工作流程
- 编码器生成上下文语义张量 -> 什么是nlp? 将问题转换成语义张量
- 解码器根据编码器的语义张量和上一时间步的预测值以及上一时间步的隐藏状态值进行当前时间步的预测
  - 自回归模式
局限性
- 信息瓶颈问题
- 长序列问题

1.2 数据集介绍

1748849163083

每行样本由英文句子和法文句子对组成, 中间用\t分隔开
英文句子是编码器的输入序列, 法文句子是解码器的输出序列(预测序列)对应的真实序列

1.3 案例实现步骤

1.3.1 文本清洗工具函数

utils.py

# 用于正则表达式
import re
# 用于构建网络结构和函数的torch工具包
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# torch中预定义的优化方法工具包
import torch.optim as optim
import time
# 用于随机生成数据
import random
import numpy as np
import matplotlib.pyplot as plt

# Shared configuration used by every script in this case study
# Device selection: use CUDA when it is available, otherwise fall back to the CPU
# NOTE(review): 'cuda' selects the current default CUDA device (not every GPU); 'cuda:0' names the first GPU explicitly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Index of the start-of-sentence token
# SOS -> start of sentence
SOS_token = 0
# Index of the end-of-sentence token
EOS_token = 1
# Path of the sentence-pair data file
data_path = 'data/eng-fra-v2.txt'
# Maximum sentence length; per the original note this value came from a preprocessing analysis
MAX_LENGTH = 10


# 定义处理文本的工具函数  处理句子中的特殊符号/大小写/换行符
def normalizeString(s: str) -> str:
	"""Clean one raw sentence so it can be split on single spaces.

	The sentence is lower-cased and stripped, every '.', '!' or '?' gets a
	space inserted in front of it, and every run of characters other than
	a-z . ! ? is collapsed into a single space.

	:param s: raw sentence text
	:return: the cleaned sentence (it may end with a space when the input ends
		with a character that is replaced)
	"""
	# Use a neutral local name: the original bound the result to `str`, shadowing the builtin
	text = s.lower().strip()
	# Separate sentence punctuation from the preceding word: 'sad.' -> 'sad .'
	text = re.sub(r'([.!?])', r' \1', text)
	# Replace every run of other characters (digits, symbols, whitespace, ...) with one space
	text = re.sub(r'[^a-z.!?]+', r' ', text)
	return text


if __name__ == '__main__':
	# Smoke run of the cleaning helper on one sample; the return value is not printed here
	str1 = 'I m sad.@'
	normalizeString(str1)

1.3.2 数据预处理

preprocess.py

清洗文本和构建词表

from utils import *


def my_getdata():
	"""Load the sentence-pair file and build both vocabularies.

	Every line of ``data_path`` is split on tabs and each field is cleaned with
	``normalizeString``. Both vocabularies start with 'SOS' -> 0 and 'EOS' -> 1;
	new words receive the next free index in order of first appearance.

	:return: (english_word2index, english_index2word, english_word_n,
		french_word2index, french_index2word, french_word_n, my_pairs)
	"""
	# Step 1: read the whole file and split it into one raw sample per line
	with open(data_path, 'r', encoding='utf-8') as f:
		raw_lines = f.read().strip().split('\n')
	# Each sample becomes a list of cleaned fields: [english sentence, french sentence]
	my_pairs = [[normalizeString(field) for field in raw.split('\t')] for raw in raw_lines]

	# Step 2: build the {word: index} tables, seeded with the two special tokens
	english_word2index = {'SOS': 0, 'EOS': 1}
	french_word2index = {'SOS': 0, 'EOS': 1}
	for sentences in my_pairs:
		# len(table) is always the next unused index, so setdefault assigns it only to unseen words
		for token in sentences[0].split(' '):
			english_word2index.setdefault(token, len(english_word2index))
		for token in sentences[1].split(' '):
			french_word2index.setdefault(token, len(french_word2index))

	# Vocabulary sizes (the special tokens are included)
	english_word_n = len(english_word2index)
	french_word_n = len(french_word2index)

	# Step 3: derive the reverse {index: word} tables
	english_index2word = {idx: word for word, idx in english_word2index.items()}
	french_index2word = {idx: word for word, idx in french_word2index.items()}
	return (english_word2index, english_index2word, english_word_n,
	        french_word2index, french_index2word, french_word_n, my_pairs)


if __name__ == '__main__':
	# Smoke run: load the pairs and build both vocabularies (nothing is printed)
	(english_word2index, english_index2word, english_word_n, 
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()

构建数据源对象

# Dataset that turns cleaned sentence pairs into index tensors
class MyPairsDataset(Dataset):
	"""Map-style dataset over [english, french] sentence pairs.

	Each item is a pair of 1-D long tensors holding the word indices of the
	English and French sentence, each terminated with ``EOS_token``.
	"""

	def __init__(self, my_pairs, english_word2index, french_word2index):
		"""
		:param my_pairs: list of [english sentence, french sentence] strings
		:param english_word2index: {word: index} table for English
		:param french_word2index: {word: index} table for French
		"""
		self.my_pairs = my_pairs  # [[english, french], ...]
		self.english_word2index = english_word2index
		# The French table maps word -> index, so store it under a matching name
		self.french_word2index = french_word2index
		# Backward-compatible alias: the original code stored this same word -> index
		# table under the misleading attribute name `french_index2word`
		self.french_index2word = french_word2index
		# Number of samples
		self.sample_len = len(my_pairs)

	def __len__(self):
		"""Return the number of sentence pairs."""
		return self.sample_len

	def __getitem__(self, index):
		"""
		Convert one sentence pair into index tensors.

		:param index: sample position; values outside [0, len - 1] are clamped
			into that range instead of raising
		:return: tensor_x, tensor_y (1-D long tensors created on ``device``)
		:raises KeyError: if a word is missing from the corresponding vocabulary
		"""
		# Clamp the index so it never leaves the valid range
		index = min(max(index, 0), self.sample_len - 1)
		english_sentence = self.my_pairs[index][0]
		french_sentence = self.my_pairs[index][1]
		# Look every space-separated word up in its vocabulary
		x = [self.english_word2index[word] for word in english_sentence.split(' ')]
		y = [self.french_word2index[word] for word in french_sentence.split(' ')]
		# Terminate both sequences with the end-of-sentence index
		x.append(EOS_token)
		y.append(EOS_token)
		# Create the tensors directly on the configured device
		tensor_x = torch.tensor(x, dtype=torch.long, device=device)
		tensor_y = torch.tensor(y, dtype=torch.long, device=device)
		return tensor_x, tensor_y


if __name__ == '__main__':
	(english_word2index, english_index2word, english_word_n,
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()
	# Build the custom dataset
	my_dataset = MyPairsDataset(my_pairs, english_word2index, french_word2index)
	print('my_dataset数据集条目数--->', len(my_dataset))
	print(my_dataset[0])
	# Wrap the dataset in a shuffling DataLoader with one sentence pair per batch
	my_dataloader = DataLoader(dataset=my_dataset, batch_size=1, shuffle=True)
	# Inspect the first batch only
	for i, (x, y) in enumerate(my_dataloader):
		print('x--->', x.shape, x)
		print('y--->', y.shape, y)
		break

1.3.3 构建基于GRU的编码器和解码器

构建基于GRU的编码器 encoderrnn.py

from preprocess import *
class EncoderRNN(nn.Module):
	"""GRU encoder: embeds source-token indices and runs them through one GRU layer."""

	def __init__(self, input_size, hidden_size):
		"""
		:param input_size: number of rows of the embedding table (English vocabulary size)
		:param hidden_size: width shared by the embedding vectors and the GRU hidden state
		"""
		super().__init__()
		self.input_size = input_size
		self.hidden_size = hidden_size
		# Embedding table: one hidden_size-wide vector per vocabulary index
		self.embedding = nn.Embedding(input_size, hidden_size)
		# batch_first=True: the GRU consumes and emits (batch, seq_len, features) tensors
		self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

	def forward(self, input, hidden):
		"""
		:param input: tensor of token indices fed to the embedding layer
		:param hidden: initial GRU hidden state
		:return: (output, hn) exactly as returned by the GRU
		"""
		vectors = self.embedding(input)
		return self.gru(vectors, hidden)

	def inithidden(self):
		"""Return an all-zero initial hidden state of shape (1, 1, hidden_size) on ``device``."""
		return torch.zeros(1, 1, self.hidden_size, device=device)


if __name__ == '__main__':
	# Load the data
	(english_word2index, english_index2word, english_word_n,
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()
	# Build the tensor dataset
	my_dataset = MyPairsDataset(my_pairs, english_word2index, french_word2index)
	# Build the data loader
	# batch_size is 1 because the sentences have different lengths
	my_dataloader = DataLoader(dataset=my_dataset, batch_size=1, shuffle=True)
	# Build the encoder
	my_encoderrnn = EncoderRNN(input_size=english_word_n, hidden_size=256).to(device=device)
	for i, (x, y) in enumerate(my_dataloader):
		# Variant 1: feed the whole sentence in a single call
		# Start from a zero hidden state
		hidden = my_encoderrnn.inithidden()
		encoder_output, hn = my_encoderrnn(x, hidden)
		print('encoder_output--->', encoder_output.shape)
		print('hn--->', hn.shape)
		
		# Variant 2: feed one time step at a time, carrying the hidden state forward
		# (the decoder has to be driven this way)
		hidden = my_encoderrnn.inithidden()
		# x.shape[1]: number of tokens in x, i.e. the number of time steps
		for j in range(x.shape[1]):
			# Reshape the j-th token index into a (1, 1) tensor
			tmp_x = x[0][j].view(1, -1)
			print('tmp_x--->', tmp_x)
			output, hidden = my_encoderrnn(tmp_x, hidden)
		# Compare the last-step output of both variants
		print('观察：最后一个时间步output输出是否相等')  # original note: hidden_size = 8 works better for this comparison
		print('encoder_output[0][-1]===>', encoder_output[0][-1])
		print('output===>', output)
		break

构建基于GRU的解码器 decoderrnn.py

from encoderrnn import *

class DecoderRNN(nn.Module):
	"""GRU decoder without attention; each call scores the vocabulary for one step."""

	def __init__(self, output_size, hidden_size):
		"""
		:param output_size: French vocabulary size (embedding rows and width of the output layer)
		:param hidden_size: width shared by the embedding vectors and the GRU hidden state
		"""
		super().__init__()
		self.output_size = output_size
		self.hidden_size = hidden_size
		# NOTE: the attribute is spelled `embeding` (sic); the name is part of the state_dict keys
		self.embeding = nn.Embedding(output_size, hidden_size)
		self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
		# Projection from the GRU output onto the vocabulary, followed by log-softmax
		self.out = nn.Linear(hidden_size, output_size)
		self.softmax = nn.LogSoftmax(dim=-1)

	def forward(self, input, hidden):
		"""
		:param input: tensor of token indices for the current step
		:param hidden: previous hidden state (the encoder's final state at the first step)
		:return: (log-probabilities over the vocabulary, new hidden state)
		"""
		print('input--->', input.shape)
		vectors = self.embeding(input)
		print('embedded--->', vectors.shape)
		# ReLU on the embedding adds a non-linearity before the GRU
		activated = torch.relu(vectors)
		print('embedded--->', activated.shape)
		gru_out, hidden = self.gru(activated, hidden)
		print('output--->', gru_out.shape, gru_out)
		# gru_out[0] drops the batch axis so the linear layer receives a 2-D tensor;
		# the result holds log-softmax values (<= 0), not probabilities
		scores = self.out(gru_out[0])
		output = self.softmax(scores)
		print('output--->', output.shape, output)
		return output, hidden
	
	
if __name__ == '__main__':
	# Load the data
	(english_word2index, english_index2word, english_word_n,
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()
	# Build the tensor dataset
	my_dataset = MyPairsDataset(my_pairs, english_word2index, french_word2index)
	# Build the data loader
	# batch_size is 1 because the sentences have different lengths
	my_dataloader = DataLoader(dataset=my_dataset, batch_size=1, shuffle=True)
	# Build the encoder
	my_encoderrnn = EncoderRNN(input_size=english_word_n, hidden_size=256).to(device=device)
	# Build the decoder
	output_size = french_word_n
	hidden_size = 256
	my_decoderrnn = DecoderRNN(output_size, hidden_size).to(device)
	for i, (x, y) in enumerate(my_dataloader):
		# Encode the whole source sentence in one call
		# Start from a zero hidden state
		hidden = my_encoderrnn.inithidden()
		encoder_output, hn = my_encoderrnn(x, hidden)
		print('encoder_output--->', encoder_output.shape)
		print('hn--->', hn.shape, hn)
		
		# Decode autoregressively, one token at a time
		for j in range(y.shape[1]) :
			# Input of the current step (equivalent to the previous step's prediction)
			# Here the ground-truth tokens of y are fed to simulate decoding; in real
			# decoding the first input token is always the start symbol
			tmp_y = y[0][j].view(1, -1)
			# Decode one step
			# The initial hidden state is the encoder's hidden state of the last time step
			my_decoderrnn(tmp_y, hn)
			break
		break

构建基于GRU和Attention的解码器 decoderrnn.py

# GRU decoder with attention over the length-padded encoder outputs
class AttnDecoderRNN(nn.Module):
	"""Attention decoder that predicts one target token per call.

	The attention weights are computed from the concatenation of the current
	input embedding and the previous hidden state, then applied to the encoder
	outputs, which must be padded to ``max_length`` rows.
	"""

	def __init__(self, output_size, hidden_size, dropout_p=0.2, max_length=MAX_LENGTH):
		"""
		:param output_size: target vocabulary size (embedding rows and output-layer width)
		:param hidden_size: width shared by embeddings, GRU state and encoder outputs
		:param dropout_p: dropout probability applied to the input embedding
		:param max_length: number of encoder positions the attention layer scores
		"""
		super().__init__()
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.max_length = max_length
		self.dropout_p = dropout_p
		self.embedding = nn.Embedding(num_embeddings=self.output_size, embedding_dim=self.hidden_size)
		self.gru = nn.GRU(input_size=self.hidden_size, hidden_size=self.hidden_size, batch_first=True)
		self.out = nn.Linear(in_features=self.hidden_size, out_features=self.output_size)
		self.softmax = nn.LogSoftmax(dim=-1)
		# Scores one weight per encoder position from [embedding ; hidden]
		self.attn = nn.Linear(in_features=self.hidden_size + self.hidden_size, out_features=self.max_length)
		# Merges [embedding ; context] back to the GRU input width
		self.attn_combine = nn.Linear(in_features=self.hidden_size + self.hidden_size, out_features=self.hidden_size)
		self.dropout = nn.Dropout(p=self.dropout_p)

	def forward(self, input, hidden, encoder_outputs):
		"""
		Run one decoding step.

		The debug ``print`` calls of the original implementation were removed:
		they dumped full tensors on every step of training and inference.

		:param input: (1, 1) tensor with the current input token index
			(the previous prediction, or the ground truth under teacher forcing)
		:param hidden: (1, 1, hidden_size) previous hidden state; at the first step
			this is the encoder's final hidden state
		:param encoder_outputs: (max_length, hidden_size) encoder outputs, zero-padded
		:return: (log-probabilities of shape (1, output_size),
			new hidden state, attention weights of shape (1, max_length))
		"""
		# Embed the input token and regularize with dropout
		embedded = self.embedding(input)
		embedded = self.dropout(embedded)

		# Attention weights: linear layer over [embedding ; hidden], normalized with softmax.
		# embedded[0] / hidden[0] drop the leading axis so the linear layer gets 2-D input.
		attn_weights = torch.softmax(self.attn(torch.cat(tensors=[embedded[0], hidden[0]], dim=1)), dim=-1)

		# Context vector: weighted sum of the encoder outputs via batched matmul,
		# (1, 1, max_length) x (1, max_length, hidden_size) -> (1, 1, hidden_size)
		attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

		# Fuse embedding and context into the GRU input; unsqueeze restores the 3-D shape
		output = self.attn_combine(torch.cat(tensors=[embedded[0], attn_applied[0]], dim=1)).unsqueeze(0)
		output = torch.relu(output)

		# One GRU step
		output, hidden = self.gru(output, hidden)

		# Project onto the vocabulary and return log-probabilities
		output = self.softmax(self.out(output[0]))
		return output, hidden, attn_weights


if __name__ == '__main__':
	# Load the data
	(english_word2index, english_index2word, english_word_n,
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()
	# Build the tensor dataset
	my_dataset = MyPairsDataset(my_pairs, english_word2index, french_word2index)
	# Build the data loader
	# batch_size is 1 because the sentences have different lengths
	my_dataloader = DataLoader(dataset=my_dataset, batch_size=1, shuffle=True)
	# Build the encoder
	my_encoderrnn = EncoderRNN(input_size=english_word_n, hidden_size=256).to(device=device)
	# Decoder sizes
	output_size = french_word_n
	hidden_size = 256
	
	# Build the attention decoder
	my_attndecoderrnn = AttnDecoderRNN(output_size, hidden_size).to(device)
	for i, (x, y) in enumerate(my_dataloader):
		# Encode the whole source sentence in one call
		# Start from a zero hidden state
		hidden = my_encoderrnn.inithidden()
		encoder_output, hn = my_encoderrnn(x, hidden)
		print('encoder_output--->', encoder_output.shape, encoder_output)
		
		# Pad the encoder outputs to the maximum length
		# Start from an all-zero tensor of shape (MAX_LENGTH, hidden_size)
		encoder_output_c = torch.zeros(size=(MAX_LENGTH, my_encoderrnn.hidden_size), device=device)
		# Copy the real encoder outputs into the leading rows
		for idx in range(x.shape[1]):
			encoder_output_c[idx] = encoder_output[0][idx]
		print('encoder_output_c--->', encoder_output_c.shape, encoder_output_c)
		# Decode autoregressively, one token at a time
		for j in range(y.shape[1]):
			# Input of the current step (equivalent to the previous step's prediction)
			# Here the ground-truth tokens of y are fed to simulate decoding; in real
			# decoding the first input token is always the start symbol
			tmp_y = y[0][j].view(1, -1)
			# Decode one step
			# hn: the encoder's hidden state of the last time step, used as the
			# decoder's initial hidden state
			print('hn--->', hn.shape, hn)
			output, hidden, attn_weights = my_attndecoderrnn(tmp_y, hn, encoder_output_c)
			print('=' * 80)
			print('output--->', output.shape, output)
			print('hidden--->', hidden.shape, hidden)
			print('attn_weights--->', attn_weights.shape, attn_weights)
			break
		break

1.3.4 构建模型训练函数并进行训练

Teacher Forcing介绍
- 概念
  - 解码时用真实y值作为输入
  - 一种增强模型训练效果的技术
- 作用
  - 加快模型收敛速度
  - 稳定模型训练过程
  - 使用真实y,损失值小
- 优点
  - 加快模型收敛速度
  - 稳定模型训练过程
  - 使用真实y,损失值小
- 缺点
  - 训练和推理时不一致(曝光偏差 exposure bias): 推理时没有真实y, 只能用自己的预测值作为输入, 误差会逐步累积, 导致模型泛化效果变差
- 改进方法
  - Scheduled Sampling: 计划采样
    - 随机生成一个随机数和Teacher Forcing比例进行比较
    - if判断
      - 小于等于比例, 真实y
      - 大于比例, 预测y
  - Curriculum Learning: 课程学习
    - Teacher Forcing比例: 前期大, 后期小, 衰减
    - 调整训练样本的顺序
      - 先训练短句, 再训练长句
      - 先训练高质量句子, 再训练低质量句子

构建内部迭代训练函数 train.py

from decoderrnn import *

# Training hyper-parameters
# Learning rate
mylr = 1e-4
# Number of passes over the data set
epochs = 2
# Teacher forcing ratio, set to 0.5
teacher_forcing_ratio = 0.5
# Print a log line every 1000 iterations
print_interval_num = 1000
# Record a loss value for the plot every 100 iterations
plot_interval_num = 100

def train_iters(x,y,
                my_encoderrnn:EncoderRNN,
                my_attndecoderrnn:AttnDecoderRNN,
                myadam_encode: optim.Adam,
                myadam_decode: optim.Adam,
                mynllloss: nn.NLLLoss):
	"""
	Run one training step (forward, loss, backward, update) on a single sentence pair.

	:param x: English sentence as an index tensor of shape (1, source_len)
	:param y: reference French sentence as an index tensor of shape (1, target_len)
	:param my_encoderrnn: encoder
	:param my_attndecoderrnn: attention decoder
	:param myadam_encode: optimizer of the encoder
	:param myadam_decode: optimizer of the decoder
	:param mynllloss: loss applied to the decoder's log-probabilities
	:return: average loss per decoded token of this sentence
	"""
	# Step 1: switch both models to training mode (enables dropout)
	my_encoderrnn.train()
	my_attndecoderrnn.train()
	# Step 2: encode the source sentence starting from a zero hidden state
	encode_h0 = my_encoderrnn.inithidden()
	encode_output, encode_hn = my_encoderrnn(x, encode_h0)
	# Step 3: pad the encoder outputs to MAX_LENGTH rows for the attention layer
	encode_output_c = torch.zeros(size=(MAX_LENGTH, my_encoderrnn.hidden_size), device=device)
	for idx in range(x.shape[1]):
		encode_output_c[idx] = encode_output[0, idx]
	# Step 4: first decoder step starts from the encoder's final hidden state and the SOS token
	decode_hidden = encode_hn
	input_y = torch.tensor(data=[[SOS_token]], device=device)
	# Step 5: accumulators
	myloss = 0.0  # summed loss of this sentence
	iters_num = 0  # number of decoded tokens
	# Step 6: decide once per sentence whether teacher forcing is used
	use_teacher_forcing = random.random() < teacher_forcing_ratio
	# Step 7: autoregressive decoding; stops after len(y) steps or when EOS is predicted
	for idx in range(y.shape[1]):
		# Carry the hidden state forward. The original bound the returned state to an
		# unused name, so every step restarted from the encoder's final state.
		output_y, decode_hidden, _ = my_attndecoderrnn(input_y, decode_hidden, encode_output_c)
		# Ground-truth token of this step
		target_y = y[0][idx].view(1)
		myloss += mynllloss(output_y, target_y)
		iters_num += 1
		if use_teacher_forcing:
			# Next input is the ground-truth token
			input_y = target_y.view(1, -1)
		else:
			# Next input is the model's own most likely token
			_, topi = output_y.topk(1)
			# Stop decoding once the model predicts the end symbol
			if topi.item() == EOS_token:
				break
			input_y = topi

	# Step 8: clear old gradients, back-propagate, update both models
	myadam_encode.zero_grad()
	myadam_decode.zero_grad()

	myloss.backward()

	myadam_encode.step()
	myadam_decode.step()

	# Step 9: average loss = summed loss / number of decoded tokens
	return myloss.item() / iters_num

构建模型训练函数 train.py

def train_seq2seq():
	"""
	Train the encoder and the attention decoder and plot the loss curve.

	After every epoch both state dicts are saved under ./model; after training
	the collected loss averages are plotted and saved under ./image.

	:return: list of average losses, one entry per ``plot_interval_num`` iterations
	"""
	# Function-scope import: only needed here to create the output folders
	import os

	# Load the data and build the dataset / loader (one sentence pair per batch)
	(english_word2index, english_index2word, english_word_n,
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()
	mypairsdataset = MyPairsDataset(my_pairs, english_word2index, french_word2index)
	mydataloader = DataLoader(dataset=mypairsdataset, batch_size=1, shuffle=True)

	# Build the models; the attention width follows MAX_LENGTH instead of a literal 10
	my_encoderrnn = EncoderRNN(english_word_n, 256).to(device)
	my_attndecoderrnn = AttnDecoderRNN(output_size=french_word_n, hidden_size=256, dropout_p=0.1,
	                                   max_length=MAX_LENGTH).to(device)

	# One optimizer per model
	myadam_encode = optim.Adam(my_encoderrnn.parameters(), lr=mylr)
	myadam_decode = optim.Adam(my_attndecoderrnn.parameters(), lr=mylr)

	# The decoder emits log-softmax values, so NLLLoss is the matching loss
	mynllloss = nn.NLLLoss()

	# Create the output folders up front so saving cannot fail after a full epoch
	os.makedirs('./model', exist_ok=True)
	os.makedirs('./image', exist_ok=True)

	# Average losses collected for the plot
	plot_loss_list = []

	for epoch_idx in range(1, epochs + 1):
		# Running sums for the log line and for the plot
		print_loss_total = 0.0
		plot_loss_total = 0.0
		starttime = time.time()
		# start=1 so the iteration counter is 1-based and the modulo checks fire on full intervals
		for item, (x, y) in enumerate(mydataloader, start=1):
			loss = train_iters(x, y,
			                   my_encoderrnn,
			                   my_attndecoderrnn,
			                   myadam_encode,
			                   myadam_decode,
			                   mynllloss)
			print_loss_total += loss
			plot_loss_total += loss
			# Log every print_interval_num iterations (the check uses the same constant as the average)
			if item % print_interval_num == 0:
				print_loss_avg = print_loss_total / print_interval_num
				print_loss_total = 0.0
				print('轮次%d  损失%.6f 时间:%d' % (epoch_idx, print_loss_avg, time.time() - starttime))
			# Collect a plot point every plot_interval_num iterations
			if item % plot_interval_num == 0:
				plot_loss_avg = plot_loss_total / plot_interval_num
				plot_loss_list.append(plot_loss_avg)
				plot_loss_total = 0.0
		# Save both models after every epoch
		torch.save(my_encoderrnn.state_dict(), './model/my_encoderrnn_model_%d.bin' % (epoch_idx))
		torch.save(my_attndecoderrnn.state_dict(), './model/my_attndecoderrnn_model_%d.bin' % (epoch_idx))

	# Plot the loss curve; plot_loss_list is a plain list of floats, so it is passed
	# directly (the original called .detach() on the list, which raises AttributeError)
	plt.figure()
	plt.plot(plot_loss_list)
	plt.savefig('./image/plot_loss_list.png')
	plt.show()
	return plot_loss_list


if __name__ == '__main__':
	# Run the full training and keep the collected loss averages
	plot_loss_list = train_seq2seq()

1.3.5 构建模型评估函数并测试

构建模型评估函数

from decoderrnn import *

# Checkpoints written by train_seq2seq() after the final (2nd) epoch; the names
# must match its save pattern 'my_*_model_<epoch>.bin'
PATH1 = 'model/my_encoderrnn_model_2.bin'
PATH2 = 'model/my_attndecoderrnn_model_2.bin'

def seq2seq_evaluate(x,
                     my_encoderrnn: EncoderRNN,
                     my_attndecoderrnn: AttnDecoderRNN,
                     french_index2word):
	"""
	Greedy-decode the French translation of one English sentence.

	:param x: English sentence as an index tensor of shape (1, source_len)
	:param my_encoderrnn: encoder
	:param my_attndecoderrnn: attention decoder
	:param french_index2word: {index: word} table used to turn predictions into words
	:return: (list of predicted words, ending with '<EOS>' when the end symbol was
		predicted; attention weights with one row per decoded step)
	"""
	# Inference mode: dropout off, no gradient tracking
	my_encoderrnn.eval()
	my_attndecoderrnn.eval()
	with torch.no_grad():
		# Encode the source sentence from a zero hidden state
		encode_output, encode_hn = my_encoderrnn(x, my_encoderrnn.inithidden())
		# Pad the encoder outputs to MAX_LENGTH rows for the attention layer
		encode_output_c = torch.zeros(size=(MAX_LENGTH, my_encoderrnn.hidden_size), device=device)
		for pos in range(x.shape[1]):
			encode_output_c[pos] = encode_output[0, pos]
		# The decoder starts from the encoder's final state and the SOS token
		decode_hidden = encode_hn
		input_y = torch.tensor(data=[[SOS_token]], device=device)
		decode_words = []
		# One row of attention weights per possible decoding step
		decoder_attentions = torch.zeros(size=(MAX_LENGTH, MAX_LENGTH), device=device)
		for step in range(MAX_LENGTH):
			output_y, decode_hidden, attn_weights = my_attndecoderrnn(input_y, decode_hidden, encode_output_c)
			decoder_attentions[step] = attn_weights
			# Greedy choice: index of the highest log-probability
			topi = output_y.topk(1)[1]
			predicted = topi.item()
			if predicted == EOS_token:
				# End symbol predicted: record it and stop
				decode_words.append('<EOS>')
				break
			decode_words.append(french_index2word[predicted])
			# Feed the prediction back in as the next input
			input_y = topi
	# Rows after the last decoded step are still zero, so they are cut off
	return decode_words, decoder_attentions[: step + 1]

调用模型评估函数

# Model inference entry point
def inference():
	"""Load the trained encoder/decoder and translate three sample sentences.

	The saved weights are read from ``PATH1`` / ``PATH2``; the English sentence,
	the reference French sentence and the predicted sentence are printed.
	"""
	# Step 1: rebuild the vocabularies (they define the model sizes and the index mapping)
	(english_word2index, english_index2word, english_word_n,
	 french_word2index, french_index2word, french_word_n, my_pairs) = my_getdata()
	# Step 2: build the encoder and load its weights
	my_encoderrnn = EncoderRNN(input_size=english_word_n, hidden_size=256)
	print('my_encoderrnn--->', my_encoderrnn)
	# map_location keeps the loaded storages on the CPU regardless of where they were saved
	# NOTE(review): strict=False does not raise on missing/unexpected keys, so a checkpoint
	# whose layer names do not match leaves those layers at their random initial values
	my_encoderrnn.load_state_dict(torch.load(PATH1, map_location=lambda storage,loc: storage), strict=False)
	# Move the model to the device the inputs and the initial hidden state are created on;
	# without this the forward pass fails on a CUDA machine
	my_encoderrnn.to(device)
	print('my_encoderrnn--->', my_encoderrnn)
	# Step 3: same for the attention decoder
	my_attndecoderrnn = AttnDecoderRNN(output_size=french_word_n, hidden_size=256)
	my_attndecoderrnn.load_state_dict(torch.load(PATH2, map_location=lambda storage,loc: storage), strict=False)
	my_attndecoderrnn.to(device)
	# Step 4: three test samples
	my_samplepairs = [['i m impressed with your french .', 'je suis impressionne par votre francais .'],
	                  ['i m more than a friend .', 'je suis plus qu une amie .'],
	                  ['she is beautiful like her mother .', 'elle est belle comme sa mere .']]
	print('my_samplepairs--->', len(my_samplepairs))
	# Step 5: apply the same feature processing as during training
	for idx, pair in enumerate(my_samplepairs):
		x = pair[0]
		y = pair[1]
		# Words -> indices, terminated with EOS, wrapped as a (1, length) tensor
		tem_x = [english_word2index[word] for word in x.split(' ')]
		tem_x.append(EOS_token)
		tensor_x = torch.tensor([tem_x], dtype=torch.long, device=device)
		# Step 6: run the greedy decoder
		decode_words, decoder_attentions = seq2seq_evaluate(tensor_x, my_encoderrnn, my_attndecoderrnn, french_index2word)
		# Step 7: join the predicted words into one sentence
		output_sentence = ' '.join(decode_words)

		print('\n')
		print('需要推理的英文句子--->', x)
		print('真实的法文句子--->', y)
		print('推理的法文句子--->', output_sentence)


if __name__ == '__main__':
	# Translate the built-in sample sentences with the saved models
	inference()

attention张量制图

2 transformer介绍

概念
- transformer是基于自注意力机制的seq2seq模型/架构/框架
核心思想
- 基于注意力机制
- 自注意力
- 一般注意力
作用
- 捕获超长距离语义关系
- 并行计算
- 灵活性: 处理不同的数据, 文本/语音/图像/视频
- 扩展性: 层数和多头数量可调, transformer默认是6层, 8个头

3 transformer架构

1749118800244

输入部分
- 词嵌入层
- 位置编码层
输出部分
- 线性层
- softmax层
编码器部分
- 多头自注意力子层
- 前馈全连接子层
- 残差连接层
- 规范化层(层归一化)
解码器部分
- 掩码多头自注意力子层
- 编码器-解码器多头一般注意力子层
- 前馈全连接子层
- 残差连接层
- 规范化层(层归一化)