北京市昌平建设工程招标网站,秦皇岛网站制作小程序开发,wordpress主题:精选zruckmetro主题,公司网站建设一定要求原图吗目录
1 加载预训练模型对应的分词器
2 加载数据集
3 数据预处理
4 构建数据加载器DataLoader
5 定义下游任务模型
6 测试代码
7 训练代码 #做（中文与英文的）分类任务，Bert模型比较合适，用cls向下游任务传输数…目录
1 加载预训练模型对应的分词器
2 加载数据集
3 数据预处理
4 构建数据加载器DataLoader
5 定义下游任务模型
6 测试代码
7 训练代码 #做（中文与英文的）分类任务，Bert模型比较合适，用cls向下游任务传输数据，做分类任务 #Bert模型要求一般传一个句子对（两句话） 1 加载预训练模型对应的分词器
from transformers import AutoTokenizer

# use_fast=True selects the tokenizer implemented in Rust, which is faster
# than the pure-Python implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "../data/model/distilbert-base-uncased/", use_fast=True
)
tokenizer
# Recorded repr of `tokenizer`:
# DistilBertTokenizerFast(name_or_path='../data/model/distilbert-base-uncased/',
#     vocab_size=30522, model_max_length=512, is_fast=True,
#     padding_side='right', truncation_side='right',
#     special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]',
#                     'pad_token': '[PAD]', 'cls_token': '[CLS]',
#                     'mask_token': '[MASK]'},
#     clean_up_tokenization_spaces=False)

# Trial encoding of two sentences.
tokenizer.batch_encode_plus(
    [
        "hello, everyone, today is a good day",
        "how are you, fine thank you, and you?",
    ]
)
#编码返回的是input_ids 和 attention_mask {input_ids: [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} 2 加载数据集
from datasets import load_dataset

# Load the CoLA dataset from the local directory; trust_remote_code=True
# allows the loading script shipped with the dataset to run.
dataset = load_dataset("../data/datasets/cola/", trust_remote_code=True)
datasetDatasetDict({train: Dataset({features: [text, label],num_rows: 8551})test: Dataset({features: [text, label],num_rows: 527})
}) dataset[train][0] {text: Our friends wont buy this analysis, let alone the next one we propose.,label: 1} 3 数据预处理
def f(examples, tokenizer):
    """Tokenize only the ``text`` column of a batch of examples.

    Args:
        examples: A batch from the dataset; its ``"text"`` entry is passed
            to the tokenizer as a list of sentences.
        tokenizer: The tokenizer used to encode the sentences.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the batch, with
        truncation enabled.
    """
    return tokenizer.batch_encode_plus(examples["text"], truncation=True)


dataset = dataset.map(
    f,
    batched=True,
    batch_size=1000,  # 1000 examples per batch
    # A single process is faster here: with little data, the time spent
    # creating worker processes outweighs the gain. (For large datasets this
    # could be raised, e.g. to the number of CPU cores.)
    num_proc=1,
    remove_columns=["text"],  # the raw text is replaced by input_ids
    fn_kwargs={"tokenizer": tokenizer},
)
print(dataset[train][0]) {label: 1, input_ids: [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 4 构建数据加载器DataLoader
# Every sentence in a batch must have the same length, otherwise the batch
# cannot take part in matrix operations.
import torch

# DataCollatorWithPadding pads the sentences of each batch to a common
# length when the batch is assembled.
from transformers.data.data_collator import DataCollatorWithPadding

loader = torch.utils.data.DataLoader(
    dataset=dataset["train"],
    batch_size=8,
    # The collator instance builds each batch and pads it automatically.
    collate_fn=DataCollatorWithPadding(tokenizer),
    shuffle=True,
    drop_last=True,
)

# Fetch a single batch for inspection (assignment only, nothing is printed).
data = next(iter(loader))
#data包含input_ids和 attention_mask 两部分data {input_ids: tensor([[ 101, 2043, 3021, 5610, 2015, 1010, 2035, 1996, 2062, 2515,6294, 5223, 2032, 1012, 102, 0, 0, 0, 0, 0,0],[ 101, 2057, 4687, 2008, 3021, 2187, 1012, 102, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0],[ 101, 2008, 2008, 2005, 5106, 15721, 2000, 5466, 1037, 4906,2052, 28679, 1996, 4932, 2001, 5793, 2003, 2025, 2995, 1012,102],[ 101, 1996, 2214, 3899, 2351, 2035, 1996, 2126, 1012, 102,0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0],[ 101, 1045, 2215, 2009, 2000, 3961, 1037, 3595, 2008, 3021,2187, 1012, 102, 0, 0, 0, 0, 0, 0, 0,0],[ 101, 2027, 2700, 2032, 2637, 1005, 1055, 17089, 2343, 1012,102, 0, 0, 0, 0, 0, 0, 0, 0, 0,0],[ 101, 1996, 2795, 2003, 2936, 2084, 1996, 2341, 2003, 2898,1012, 102, 0, 0, 0, 0, 0, 0, 0, 0,0],[ 101, 6294, 9619, 2098, 2000, 3046, 2000, 4025, 2000, 2031,2042, 4782, 1012, 102, 0, 0, 0, 0, 0, 0,0]]), attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]), labels: tensor([1, 0, 1, 0, 1, 1, 1, 1])} len(loader) 1068 5 定义下游任务模型
from transformers import AutoModelForSequenceClassification, DistilBertModel

# Load the stock sequence-classification model to inspect its parameters
# and layer structure.
model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained(
    "../data/model/distilbert-base-uncased/", num_labels=2
)
model_pretrained_parameters DistilBertForSequenceClassification((distilbert): DistilBertModel((embeddings): Embeddings((word_embeddings): Embedding(30522, 768, padding_idx0)(position_embeddings): Embedding(512, 768)(LayerNorm): LayerNorm((768,), eps1e-12, elementwise_affineTrue)(dropout): Dropout(p0.1, inplaceFalse))(transformer): Transformer((layer): ModuleList((0-5): 6 x TransformerBlock((attention): MultiHeadSelfAttention((dropout): Dropout(p0.1, inplaceFalse)(q_lin): Linear(in_features768, out_features768, biasTrue)(k_lin): Linear(in_features768, out_features768, biasTrue)(v_lin): Linear(in_features768, out_features768, biasTrue)(out_lin): Linear(in_features768, out_features768, biasTrue))(sa_layer_norm): LayerNorm((768,), eps1e-12, elementwise_affineTrue)(ffn): FFN((dropout): Dropout(p0.1, inplaceFalse)(lin1): Linear(in_features768, out_features3072, biasTrue)(lin2): Linear(in_features3072, out_features768, biasTrue)(activation): GELUActivation())(output_layer_norm): LayerNorm((768,), eps1e-12, elementwise_affineTrue)))))(pre_classifier): Linear(in_features768, out_features768, biasTrue)(classifier): Linear(in_features768, out_features2, biasTrue)(dropout): Dropout(p0.2, inplaceFalse)
# )   <- closing parenthesis of the model structure printed just before this block


class Model(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    The head mirrors the ``pre_classifier -> ReLU -> dropout -> classifier``
    layout of ``DistilBertForSequenceClassification`` and produces two logits
    per input sequence.
    """

    def __init__(self):
        super().__init__()

        checkpoint = "../data/model/distilbert-base-uncased/"
        self.model_pretrained = DistilBertModel.from_pretrained(checkpoint)

        # Fully connected head. The encoder's last dimension is 768, so the
        # first linear layer must accept 768 features; the final layer emits
        # one logit per class (binary classification).
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.2),
            torch.nn.Linear(768, 2),
        )

        # Copy the head weights from the stock classification model so this
        # head starts from the same values as that model's head.
        # NOTE(review): for a base (not fine-tuned) checkpoint these two
        # layers are presumably freshly initialised rather than pretrained -
        # confirm against the load-time warnings.
        model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained(
            checkpoint,
            num_labels=2,  # number of label classes
        )
        self.fc[0].load_state_dict(
            model_pretrained_parameters.pre_classifier.state_dict()
        )
        self.fc[3].load_state_dict(
            model_pretrained_parameters.classifier.state_dict()
        )

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head, optionally computing the loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional target classes; when given, the cross-entropy
                loss is computed against the logits.

        Returns:
            A dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"``.
        """
        encoded = self.model_pretrained(
            input_ids=input_ids, attention_mask=attention_mask
        )

        # Keep only position 0 of every sequence: that is the [CLS] token,
        # whose position is fixed, unlike the other tokens of a sentence.
        # This turns the three-dimensional encoder output into two dimensions.
        cls_state = encoded.last_hidden_state[:, 0]

        logits = self.fc(cls_state)

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return {"loss": loss, "logits": logits}


model = Model()
# Count the model's parameters.
print(sum(i.numel() for i in model.parameters()))
# Recorded output: 66955010

# Trial run of the downstream model on one batch.
# `out` is a dict holding the loss and the logits.
out = model(**data)
print(out["loss"], out["logits"], out["logits"].shape)
#out['logits'].shape = torch.Size([8, 2])，8是一批有8个数据，2是两个类别的得分（logits），哪个值更大就归哪个类别 tensor(0.6448, grad_fn=<NllLossBackward0>) tensor([[-0.0228, 0.0688],[-0.1635, -0.0205],[-0.1123, 0.0630],[-0.0492, 0.0820],[-0.1185, 0.1382],[-0.1488, 0.1725],[-0.0806, 0.0836],[-0.0384, 0.0721]], grad_fn=<AddmmBackward0>) torch.Size([8, 2]) #查看测试数据集的labels是否正常有效（没有-1）
dataset[test][0] {label: 1,input_ids: [101,1996,11279,8469,1996,9478,3154,1997,1996,5749,1012,102],attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 6 测试代码
def test(model):
    """Evaluate ``model`` on the test split and print its accuracy.

    Builds a DataLoader over the module-level ``dataset["test"]`` (padded with
    the module-level ``tokenizer``), predicts the class with the largest logit
    for each example, and prints the fraction of correct predictions. The
    batch index is printed every 10 batches as a progress indicator, and
    evaluation stops once batch index 50 has been processed.

    Args:
        model: A module that accepts the batch as keyword arguments and
            returns a dict containing ``"logits"``.

    Returns:
        None. The accuracy is printed, not returned.
    """
    model.eval()  # evaluation mode for prediction

    # Run each batch on whatever device the model currently lives on, so the
    # caller is not forced to move the model to the CPU first.
    model_device = next(model.parameters()).device

    loader_test = torch.utils.data.DataLoader(
        dataset=dataset["test"],
        batch_size=16,
        # Pads every batch to a common sentence length; without a collator
        # the ragged batch raises an error.
        collate_fn=DataCollatorWithPadding(tokenizer),
        shuffle=True,
        # drop_last=True discards the final partial batch, so the accuracy is
        # computed over full batches only.
        drop_last=True,
    )

    outs = []  # predicted class per example
    labels = []  # ground-truth class per example

    # No gradients are needed for prediction.
    with torch.no_grad():
        for i, data in enumerate(loader_test):
            data = {key: value.to(model_device) for key, value in data.items()}

            # out["logits"] has shape (batch_size, number of classes).
            out = model(**data)
            outs.append(out["logits"].argmax(dim=1))
            labels.append(data["labels"])

            if i % 10 == 0:
                print(i)

            if i == 50:
                break

    outs = torch.cat(outs)
    labels = torch.cat(labels)

    accuracy = (outs == labels).sum().item() / len(labels)
    print("accuracy:", accuracy)
test(model) 0
10
20
30
accuracy: 0.693359375 7 训练代码
# AdamW: the gradient-descent optimisation algorithm used for training.
# NOTE(review): recent transformers releases deprecate this AdamW in favour
# of torch.optim.AdamW - confirm against the installed version.
from transformers import AdamW
from transformers.optimization import get_scheduler  # learning-rate decay schedule

# Select the device: the first CUDA GPU when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device device(typecuda, index0) #训练代码
def train():
    """Fine-tune the module-level ``model`` with one pass over ``loader``.

    Uses AdamW with a linearly decaying learning rate (no warm-up) and clips
    the gradient norm to 1.0 on every step. Every 50 steps it prints the step
    index, the batch loss, the current learning rate and the batch accuracy.

    Reads the module-level names ``model``, ``loader`` and ``device`` and
    returns nothing.
    """
    optimizer = AdamW(
        model.parameters(), betas=(0.9, 0.999), eps=1e-8, lr=2e-5
    )

    # Linear decay with no warm-up: the learning rate starts decreasing from
    # the very first step and reaches zero after len(loader) steps.
    scheduler = get_scheduler(
        name="linear",
        num_warmup_steps=0,
        num_training_steps=len(loader),
        optimizer=optimizer,
    )

    model.to(device)
    model.train()  # training mode

    for i, data in enumerate(loader):
        # Move the batch to the same device as the model.
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        labels = data["labels"].to(device)

        # `out` is a dict holding the loss and the per-class logits.
        out = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = out["loss"]

        loss.backward()
        # Clip the gradient norm to 1.0 to keep the updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        # The optimizer was built from model.parameters(), so this clears
        # every gradient of the model.
        optimizer.zero_grad()

        if i % 50 == 0:
            lr = optimizer.param_groups[0]["lr"]
            preds = out["logits"].argmax(dim=1)
            # Divide by the actual batch size rather than a hard-coded 8.
            accuracy = (labels == preds).sum().item() / labels.size(0)
            print(i, loss.item(), lr, accuracy)

    # NOTE(review): the flattened source does not show whether this trailing
    # print() belonged inside the logging branch or after the loop.
    print()
train()
# Recorded output (step, loss, learning rate, batch accuracy):
# 0 0.6603636145591736 1.9981273408239703e-05 0.75
# 50 0.6770923733711243 1.9044943820224723e-05 0.625
# 100 0.5856966972351074 1.810861423220974e-05 0.75
# 150 0.5937663316726685 1.7172284644194758e-05 0.75
# 200 0.5329931974411011 1.6235955056179777e-05 0.75
# 250 0.47660014033317566 1.5299625468164797e-05 0.875
# 300 0.22391566634178162 1.4363295880149814e-05 0.875
# 350 0.2534029185771942 1.3426966292134834e-05 1.0
# 400 0.5150715112686157 1.2490636704119851e-05 0.75
# 450 0.5376325845718384 1.155430711610487e-05 0.75
# 500 0.48840606212615967 1.0617977528089888e-05 0.875
# 550 0.40059715509414673 9.681647940074908e-06 0.875
# 600 0.679754376411438 8.745318352059925e-06 0.75
# 650 0.21557165682315826 7.808988764044945e-06 0.875
# 700 0.6123908758163452 6.872659176029963e-06 0.75
# 750 0.4683417081832886 5.936329588014982e-06 0.75
# 800 0.38990333676338196 5e-06 0.875
# 850 0.43256130814552307 4.063670411985019e-06 0.75
# 900 0.32022809982299805 3.1273408239700374e-06 0.875
# 950 0.9173805713653564 2.1910112359550564e-06 0.625
# 1000 0.42855364084243774 1.2546816479400751e-06 0.875
# 1050 0.4637509882450104 3.183520599250937e-07 0.75

# After training, test the model again.
# The test batches are on the CPU, so the model trained on the GPU is moved
# back to the CPU before evaluating.
test(model.to("cpu"))
# Recorded output:
# 0
# 10
# 20
# 30
# accuracy: 0.779296875