import os
from datetime import datetime

import torch
from torch.optim import AdamW
from tqdm import tqdm
from transformers import BertTokenizerFast, GPT2Config, GPT2LMHeadModel
from transformers import get_linear_schedule_with_warmup

from Gpt2_Chatbot.data_preprocess.dataloader import get_dataloader
from Gpt2_Chatbot.functions_tools import calculate_acc
from Gpt2_Chatbot.parameter_config import ParameterConfig


def train_epoch(model, train_dataloader, optimizer, scheduler, epoch, args):
    '''
    Run one training epoch.
    :param model: the GPT-2 model
    :param train_dataloader: the training dataloader
    :param optimizer: the optimizer that updates the parameters
    :param scheduler: the learning-rate scheduler (with warmup)
    :param epoch: the current epoch index
    :param args: the configuration object holding the training parameters
    :return: the mean loss over all iterations in this epoch
    '''
    print("start training...")
    model.train()
    device = args.device
    epoch_start_time = datetime.now()
    total_loss = 0
    epoch_correct_num, epoch_total_num = 0, 0
    for batch_idx, (input_ids, labels) in enumerate(tqdm(train_dataloader)):
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        # GPT2LMHeadModel computes the language-modeling loss internally when labels are passed
        outputs = model(input_ids, labels=labels)
        logits = outputs.logits
        loss = outputs.loss
        total_loss += loss.item()
        # token-level accuracy for this batch; label positions equal to ignore_index
        # (padding / masked tokens) are excluded from the counts
        ignore_index = args.ignore_index
        batch_correct_num, batch_total_num = calculate_acc(logits, labels, ignore_index=ignore_index)
        batch_acc = batch_correct_num / batch_total_num
        epoch_correct_num += batch_correct_num
        epoch_total_num += batch_total_num
        '''
        Gradient accumulation (e.g. gradient_accumulation_steps = 4):
        if the accumulation step count is greater than 1, the loss has to be scaled down,
        because gradients from several batches are accumulated before a single weight
        update instead of updating after every batch. Dividing the loss by the number of
        accumulation steps makes the accumulated gradient equal to the average over the
        effective (larger) batch.
        '''
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
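        # Illustrative example (numbers are made up, not from ParameterConfig): with a
        # per-batch size of 8 and gradient_accumulation_steps = 4, gradients from 4 batches
        # are summed before one optimizer.step(), giving an effective batch size of 32;
        # dividing each loss by 4 keeps that summed gradient equal to the gradient of the
        # mean loss over the effective batch.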
        loss.backward()
        '''
        Gradient clipping: when max_norm / total_norm < 1, every gradient is scaled by that
        factor so the global gradient norm does not exceed max_norm, which helps avoid
        exploding gradients.
        Arguments:
            parameters: the model parameters whose gradients are clipped
            max_norm: the maximum allowed gradient norm; larger norms are scaled down
        '''
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=args.max_grad_norm)
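        # For example (illustrative numbers): if the global gradient norm is 8.0 and
        # args.max_grad_norm is 2.0, every gradient is multiplied by 2.0 / 8.0 = 0.25;
        # if the norm is already at or below max_grad_norm, the gradients are left unchanged.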
        # only update the weights (and the learning rate) once every gradient_accumulation_steps batches
        if (batch_idx + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
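        # Illustrative check (numbers are made up): with len(train_dataloader) = 1000 and
        # gradient_accumulation_steps = 4, the optimizer steps 250 times per epoch, which
        # matches t_total = len(train_dataloader) // gradient_accumulation_steps * epochs
        # used to build the scheduler in train() below.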
        if (batch_idx + 1) % args.loss_step == 0:
            # multiply by gradient_accumulation_steps to report the unscaled loss
            print(
                "batch {} of epoch {}, loss {:.4f}, batch_acc {:.4f}, lr {}".format(
                    batch_idx + 1, epoch + 1, loss.item() * args.gradient_accumulation_steps,
                    batch_acc, scheduler.get_last_lr()[0]))
    epoch_mean_loss = total_loss / len(train_dataloader)
    epoch_mean_acc = epoch_correct_num / epoch_total_num
    print("epoch {}: loss {:.4f}, predict_acc {:.4f}".format(epoch + 1, epoch_mean_loss, epoch_mean_acc))
    # save a checkpoint every 2 epochs, and always after the final epoch
    # (save_pretrained writes the model weights and config into model_path)
    if (epoch + 1) % 2 == 0 or (epoch + 1) == args.epochs:
        print('Saving the model for epoch {}'.format(epoch + 1))
        model_path = os.path.join(args.save_model_path, 'bj_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model.save_pretrained(model_path)
        print('Finished saving the model for epoch {}.'.format(epoch + 1))
    epoch_finish_time = datetime.now()
    print('Time taken for this epoch: {}'.format(epoch_finish_time - epoch_start_time))
    return epoch_mean_loss


def validate_epoch(model, validate_dataloader, epoch, args):
    '''
    Evaluate the model on the validation set for one epoch.
    :param model: the model to validate
    :param validate_dataloader: the validation dataloader
    :param epoch: the current epoch index
    :param args: the configuration object (device, etc.)
    :return: the mean validation loss for this epoch
    '''
    print("start validating...")
    model.eval()
    device = args.device
    epoch_start_time = datetime.now()
    total_loss = 0
    with torch.no_grad():
        for batch_idx, (input_ids, labels) in enumerate(tqdm(validate_dataloader)):
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    epoch_mean_loss = total_loss / len(validate_dataloader)
    print("epoch {}: mean validation loss {:.4f}".format(epoch + 1, epoch_mean_loss))
    epoch_finish_time = datetime.now()
    print('Time taken for this validation epoch: {}'.format(epoch_finish_time - epoch_start_time))
    return epoch_mean_loss


def train(model, train_dataloader, validate_dataloader, args):
    optimizer = AdamW(model.parameters(), lr=args.lr, eps=args.eps)
    '''
    Learning-rate warmup.
    Purpose: let the model adapt to the data more quickly at the start of training, and avoid
    the instability or slow convergence caused by a learning rate that is too large or too
    small early on, improving both training quality and generalization.
    How it works: in the initial phase the learning rate is increased from a small value up to
    the preset initial value, and afterwards it is decreased according to the chosen schedule.
    get_linear_schedule_with_warmup: increases the learning rate linearly from 0 to the preset
    value, then decays it linearly back to 0 (both phases are linear here; non-linear
    schedules could be used instead).
        optimizer: the optimizer whose learning rate is scheduled (e.g. AdamW, Adam, SGD)
        num_warmup_steps: the number of steps over which the learning rate ramps from 0 to the preset value
        num_training_steps: the total number of optimizer updates over the whole training run
    '''
    # one optimizer update happens every gradient_accumulation_steps batches
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
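    # Illustrative example (numbers are made up, not from ParameterConfig): with
    # args.warmup_steps = 400 and t_total = 10000, the learning rate ramps linearly from 0
    # up to args.lr over the first 400 optimizer updates, then decays linearly back towards
    # 0 over the remaining 9600 updates.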
    train_losses, validate_losses = [], []
    best_val_loss = 10000
    for epoch in range(args.epochs):
        train_loss = train_epoch(
            model=model, train_dataloader=train_dataloader,
            optimizer=optimizer, scheduler=scheduler,
            epoch=epoch, args=args)
        train_losses.append(train_loss)
        validate_loss = validate_epoch(
            model=model, validate_dataloader=validate_dataloader,
            epoch=epoch, args=args)
        validate_losses.append(validate_loss)
        # keep a separate copy of the checkpoint with the lowest validation loss so far
        if validate_loss < best_val_loss:
            best_val_loss = validate_loss
            print('Saving the best model so far (epoch {})'.format(epoch + 1))
            model_path = os.path.join(args.save_model_path, 'min_loss_model_bj')
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model.save_pretrained(model_path)


def run():
    params = ParameterConfig()
    train_dataloader, validate_dataloader = get_dataloader(params.train_path, params.valid_path)
    print(f'train_dataloader-->{len(train_dataloader)}')
    print(f'validate_dataloader-->{len(validate_dataloader)}')
    # restrict training to the first GPU (this must be set before CUDA is initialized to take effect)
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    if params.pretrained_model:
        # continue from an existing pretrained / fine-tuned GPT-2 checkpoint
        model = GPT2LMHeadModel.from_pretrained(params.pretrained_model)
    else:
        # otherwise build a freshly initialized GPT-2 from the JSON config
        model_config = GPT2Config.from_json_file(params.config_json)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(params.device)
    tokenizer = BertTokenizerFast(params.vocab_path, sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]")
    # the model's vocabulary size must match the tokenizer's
    assert model.config.vocab_size == tokenizer.vocab_size
    if not os.path.exists(params.save_model_path):
        os.mkdir(params.save_model_path)
    # count the total number of model parameters
    num_parameters = 0
    for parameter in model.parameters():
        num_parameters += parameter.numel()
    print(f'Total number of model parameters: {num_parameters}')
    train(model, train_dataloader, validate_dataloader, params)


if __name__ == '__main__':
    run()