'房产')
label2id = dict()
id2label = dict()
for i, label in enumerate(set(y_lst)):
    label2id[label] = i
    id2label[i] = label

tokenizer = AutoTokenizer.from_pretrained("./models/bert-base-chinese")
First, encode all of the texts up front rather than converting them later inside the dataset; this avoids re-encoding on every epoch during training and improves efficiency:
token_lens = []
for txt in tqdm(x_lst):
    tokens = tokenizer.encode(txt, max_length=512)
    token_lens.append(len(tokens))
0%| | 0/5900 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly
truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this
strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 5900/5900 [00:07<00:00, 739.64it/s]
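The collected token lengths can guide the choice of MAX_LEN used below. A small check of the distribution (the numpy calls here are an addition for illustration, not from the original post):

import numpy as np

# Inspect the token-length distribution; a high percentile, capped at BERT's
# 512-token limit, is one common way to pick MAX_LEN.
print(f"max length: {max(token_lens)}, mean length: {np.mean(token_lens):.1f}")
print(f"95th percentile: {np.percentile(token_lens, 95):.0f}")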
class NewsDataset(Dataset):
    def __init__(self, x_lst, y_lst, tokenizer, max_len):
        self.x_lst = x_lst
        self.y_lst = y_lst
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.x_lst)

    def __getitem__(self, index):
        """
        index is the sample index; each call returns the index-th example.
        """
        text = str(self.x_lst[index])
        label = label2id[self.y_lst[index]]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',      # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'texts': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
x_train, x_val, y_train, y_val = train_test_split(x_lst, y_lst, test_size=0.15, random_state=RANDOM_SEED)  # split into training and validation sets
# dataset
train_dataset = NewsDataset(x_train, y_train, tokenizer, MAX_LEN)
val_dataset = NewsDataset(x_val, y_val, tokenizer, MAX_LEN)
# dataloader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
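As a quick sanity check (an addition, not part of the original post), one batch can be pulled from train_dataloader to confirm that the fields returned by NewsDataset have the expected shapes:

# Pull one batch and verify the tensor shapes.
batch = next(iter(train_dataloader))
print(batch['input_ids'].shape)       # torch.Size([BATCH_SIZE, MAX_LEN])
print(batch['attention_mask'].shape)  # torch.Size([BATCH_SIZE, MAX_LEN])
print(batch['labels'].shape)          # torch.Size([BATCH_SIZE])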
6.2 Custom network
Here we take the pretrained BERT model and add a Dropout layer followed by a single linear layer on top, forming the custom network:
class CustomBERTModel(nn.Module):
    def __init__(self, n_classes):
        super(CustomBERTModel, self).__init__()
        self.bert = AutoModel.from_pretrained("./models/bert-base-chinese")
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        # With return_dict=False, BERT returns (sequence_output, pooled_output);
        # pooled_output is the pooled [CLS] representation.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        output = self.drop(pooled_output)  # dropout
        return self.out(output)            # classification logits
device = set_device(cuda_index=1)
2022-12-20 16:12:39 set_device line 11 out: cuda:1
n_classes = len(label2id)
model = CustomBERTModel(n_classes)
model = model.to(device)
Some weights of the model checkpoint at ./models/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight',
'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight',
'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a
BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification
model from a BertForSequenceClassification model).
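Before training, a quick forward pass on a single batch (again an added sanity check, not from the original post) confirms that the network outputs one logit per class:

# Run one batch through the model and check the output shape.
model.eval()
with torch.no_grad():
    batch = next(iter(train_dataloader))
    logits = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
    )
print(logits.shape)  # torch.Size([BATCH_SIZE, n_classes])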
6.3 Model training
During training, the model is evaluated on the validation set after each epoch, and the weights are saved whenever the validation accuracy improves:
if val_acc > best_accuracy:
    is_best = True
    torch.save(model.state_dict(), './models/news_classification/best_model_state.bin')
    best_accuracy = val_acc
else:
    is_best = False
t.print_row(epoch, f"{train_acc:.4f}", f"{train_loss:.4f}", f"{val_acc:.4f}", f"{val_loss:.4f}", is_best)
+======+===========+====================+================+===================+===============+=============+
| | epoch | train_accuracy | train_loss | test_accuracy | test_loss | is_best |
+======+===========+====================+================+===================+===============+=============+
| 1 | 0 | 0.6080 | 1.4608 | 0.8893 | 0.5278 | True |
+------+-----------+--------------------+----------------+-------------------+---------------+-------------+
| 2 | 1 | 0.9196 | 0.3766 | 0.9096 | 0.3583 | True |
+------+-----------+--------------------+----------------+-------------------+---------------+-------------+
| 3 | 2 | 0.9589 | 0.2015 | 0.9153 | 0.3413 | True |
+------+-----------+--------------------+----------------+-------------------+---------------+-------------+
| 4 | 3 | 0.9765 | 0.1272 | 0.9153 | 0.3286 | False |
+------+-----------+--------------------+----------------+-------------------+---------------+-------------+
| 5 | 4 | 0.9836 | 0.0919 | 0.9220 | 0.3239 | True |
+------+-----------+--------------------+----------------+-------------------+---------------+-------------+
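The train_acc/train_loss and val_acc/val_loss values printed above come from per-epoch helpers that are not reproduced in this excerpt. A minimal sketch of what they might look like (the names train_epoch and eval_model, the AdamW optimizer, and CrossEntropyLoss are assumptions, not taken from the original post):

loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, dataloader):
    # One pass over the training data; returns (accuracy, mean loss).
    model.train()
    correct, total_loss = 0, 0.0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        correct += (logits.argmax(dim=1) == labels).sum().item()
        total_loss += loss.item()
    return correct / len(dataloader.dataset), total_loss / len(dataloader)

def eval_model(model, dataloader):
    # Evaluation pass without gradient updates; returns (accuracy, mean loss).
    model.eval()
    correct, total_loss = 0, 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += loss_fn(logits, labels).item()
            correct += (logits.argmax(dim=1) == labels).sum().item()
    return correct / len(dataloader.dataset), total_loss / len(dataloader)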
With the pretrained BERT model plus the custom network, the model already reaches a fairly high accuracy from the very first epochs.
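To use the saved checkpoint for prediction, the weights can be reloaded and the arg-max logit mapped back through id2label. A minimal sketch (the predict helper below is an assumption for illustration, not from the original post):

def predict(text, model, tokenizer, max_len=MAX_LEN):
    # Encode a single text, run the classifier, and map the top logit to its label.
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    model.eval()
    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        )
    return id2label[logits.argmax(dim=1).item()]

# Reload the best checkpoint saved during training, then classify a sample text.
model.load_state_dict(torch.load('./models/news_classification/best_model_state.bin', map_location=device))
print(predict("示例新闻文本", model, tokenizer))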
Source: https://www.cnblogs.com/chenhuabin/p/16997607.html