Preface
The content in this series is a write-up of my notes from following a Bilibili course.
Course link: 【手把手带你实战HuggingFace Transformers】
Main content
First, import the required packages:
import pandas as pd
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
Next, load the model. Here I'm using IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment:
tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment")
model = AutoModelForSequenceClassification.from_pretrained("IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment")
if torch.cuda.is_available():
    model.cuda()
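Since this checkpoint already ships with a two-class sentiment head, a quick check of its config shows the labels it was trained with (a minimal sketch; the values in the comments come from the checkpoint's config):
# The checkpoint comes with a 2-class sentiment head
print(model.config.num_labels)  # 2
print(model.config.id2label)    # {0: 'Negative', 1: 'Positive'}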
Now define the dataset. I'm using ChnSentiCorp_htl_all.csv, split 0.9/0.1 into training and validation sets:
class MyDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.data = pd.read_csv("ChnSentiCorp_htl_all.csv").dropna()  # read the CSV and drop rows with missing values

    def __getitem__(self, item):
        return self.data.iloc[item]["review"], self.data.iloc[item]["label"]  # return a (text, label) pair

    def __len__(self):
        return len(self.data)
dataset = MyDataset()
trainset, validset = random_split(dataset, lengths=[int(len(dataset) * 0.9), len(dataset) - int(len(dataset) * 0.9)])
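As a quick sanity check (a minimal sketch; the exact sizes depend on how many rows survive dropna()):
# Inspect the split and one raw sample
print(len(trainset), len(validset))  # roughly a 9:1 split
print(trainset[0])                   # a (review_text, label) pair straight from the CSV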
Define the collate function, the DataLoaders, and the Adam optimizer:
def trans(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(item[0])  # collect the review texts
        labels.append(item[1])  # collect the labels
    inputs = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"  # note: "tensors" is plural
    )
    inputs["labels"] = torch.tensor(labels, dtype=torch.long)  # convert labels to a tensor
    return inputs
trainloader = DataLoader(trainset, batch_size=8, shuffle=True, collate_fn=trans)
validloader = DataLoader(validset, batch_size=8, shuffle=False, collate_fn=trans)
optimizer = Adam(model.parameters(), lr=2e-5)
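To see what the collate function actually produces, you can peek at one batch (a minimal sketch; the key names come from the tokenizer output plus the labels added in trans):
# One collated batch: tokenizer tensors of shape (batch_size, max_length) plus labels
batch = next(iter(trainloader))
for k, v in batch.items():
    print(k, v.shape)  # e.g. input_ids -> torch.Size([8, 128]), labels -> torch.Size([8])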
Next, define the evaluation and training functions, run the training, and save the model:
def eval():
    model.eval()
    acc = 0
    with torch.no_grad():  # inference only, no gradients needed
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            acc += (pred == batch["labels"]).float().sum().item()  # accumulate the number of correct predictions
    return acc / len(validset)
def train(epoch=5, log_step=100):
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item():.4f}")
            global_step += 1
        acc = eval()
        print(f"ep: {ep}, acc: {acc:.4f}")
train(epoch=2)
# Save the model and tokenizer
save_directory = "./trained_model"  # target directory
model.save_pretrained(save_directory)  # saves the model weights and config
tokenizer.save_pretrained(save_directory)  # saves the tokenizer config and vocabulary
The training output looks like this:
ep: 0, global_step: 0, loss: 0.0558
ep: 0, global_step: 100, loss: 0.1422
ep: 0, global_step: 200, loss: 0.3370
ep: 0, global_step: 300, loss: 0.0450
ep: 0, global_step: 400, loss: 0.0778
ep: 0, global_step: 500, loss: 0.1903
ep: 0, global_step: 600, loss: 0.0286
ep: 0, global_step: 700, loss: 0.1090
ep: 0, global_step: 800, loss: 0.3029
ep: 0, acc: 0.9073
ep: 1, global_step: 900, loss: 0.0488
ep: 1, global_step: 1000, loss: 0.0226
ep: 1, global_step: 1100, loss: 0.0261
ep: 1, global_step: 1200, loss: 0.0269
ep: 1, global_step: 1300, loss: 0.0824
ep: 1, global_step: 1400, loss: 0.0023
ep: 1, global_step: 1500, loss: 0.0585
ep: 1, global_step: 1600, loss: 0.2029
ep: 1, global_step: 1700, loss: 0.1317
ep: 1, acc: 0.9292
('./trained_model\\tokenizer_config.json',
'./trained_model\\special_tokens_map.json',
'./trained_model\\vocab.txt',
'./trained_model\\added_tokens.json',
'./trained_model\\tokenizer.json')
A few things I learned here: in Hugging Face, AutoModel only loads the model backbone and has no output head, so it just returns the last hidden state. To get the expected behavior you need a task-specific head class such as AutoModelForSequenceClassification. When loading, you can also control whether attentions are returned, e.g. model = AutoModel.from_pretrained("./../", output_attentions=True). For classification tasks, we can use the trans function defined above to put the labels directly into inputs; the model then computes the loss automatically, and during training we only need output.loss.backward().
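To make the difference concrete, here is a minimal sketch (reusing the tokenizer loaded above; AutoModel is an extra import, and the label value 1 in the example simply means Positive according to this checkpoint's config):
from transformers import AutoModel

# Backbone only: no classification head, so we get hidden states (and attentions on request)
base_model = AutoModel.from_pretrained(
    "IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment",
    output_attentions=True,
)
enc = tokenizer("环境还可以", return_tensors="pt")
base_out = base_model(**enc)
print(base_out.last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)
print(len(base_out.attentions))          # one attention tensor per layer

# With the task-specific head: passing labels makes the model compute the loss itself
clf = AutoModelForSequenceClassification.from_pretrained("IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment")
clf_out = clf(**enc, labels=torch.tensor([1]))
print(clf_out.logits.shape, clf_out.loss)  # (batch_size, num_labels) and a scalar loss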
The model's configuration can be inspected with:
model.config
which prints:
BertConfig {
  "_name_or_path": "trained_model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Negative",
    "1": "Positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": null,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  ...
  "transformers_version": "4.43.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
You can modify the config to suit your needs, e.g. num_attention_heads.
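For example, fields can be overridden at load time by editing the config and passing it back in (a minimal sketch; the fields changed here, classifier_dropout and the label maps, are just illustrative):
from transformers import AutoConfig

# Load the saved config, tweak a few fields, and hand it back to from_pretrained
config = AutoConfig.from_pretrained("trained_model")
config.classifier_dropout = 0.2  # dropout applied on the classification head
config.id2label = {0: "Negative", 1: "Positive"}
config.label2id = {"Negative": 0, "Positive": 1}

tweaked_model = AutoModelForSequenceClassification.from_pretrained("trained_model", config=config)
print(tweaked_model.config.classifier_dropout, tweaked_model.config.id2label)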
Finally, let's write a quick test:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("trained_model")
tokenizer = AutoTokenizer.from_pretrained("trained_model")
sentence = "环境还可以,服务态度也不错,值得再来"
#sentence = "环境不太行,早餐味道也一般,电视不能投屏"
model.eval()
with torch.inference_mode():
    sen = tokenizer(sentence, return_tensors="pt")
    if torch.cuda.is_available():
        sen = {k: v.cuda() for k, v in sen.items()}
        model = model.cuda()  # make sure the model is on the GPU as well
    print(model(**sen))
    logits = model(**sen).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"Input: {sentence}, Output: {model.config.id2label.get(pred.item())}")
The output is:
SequenceClassifierOutput(loss=None, logits=tensor([[-3.8532, 2.8551]], device='cuda:0'), hidden_states=None, attentions=None)
Input: 环境还可以,服务态度也不错,值得再来, Output: Positive
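As a side note, the same inference can also be wrapped in the pipeline API (a minimal sketch; "text-classification" is the standard task name for sequence classification):
from transformers import pipeline

# Build a text-classification pipeline around the fine-tuned checkpoint
clf = pipeline("text-classification", model="trained_model", device=0 if torch.cuda.is_available() else -1)
print(clf("环境还可以,服务态度也不错,值得再来"))
# e.g. [{'label': 'Positive', 'score': ...}]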
That wraps up the full-parameter fine-tuning of this small model for sequence classification. Upcoming posts will cover other fine-tuning approaches and more Hugging Face components.