
Named Entity Recognition Based on Bert-Lstm-Crf (PyTorch Implementation)


1 Introduction

 1-1 Overview

        Named entity recognition (NER) is a fundamental task in natural language processing. It sits upstream of many applied tasks, and the quality of many downstream tasks depends on how well entities are recognized, which makes NER a very important part of NLP. The task itself is simple to state: given a piece of text, identify the nouns it contains, such as person names, organization names, song titles, album titles, and place names (collectively called entities).

 1-2 Task background

        The downstream tasks are KBQA (question answering over a knowledge graph) and entity linking, so the entities present in the input text need to be identified. The task uses a single entity type: entities are not distinguished by category, they only need to be located.

 1-3 Data source

        The data comes from Baidu Baike encyclopedia entries. In a Baike entry, entities that can be linked are rendered as blue links ("飘蓝"), so parsing an entry yields the entities contained in a piece of text. The processed training data looks like this:


  
{ "text": "三十不惑,来源于中国春秋时期一部语录体散文集《论语》,原出处的语句是:“三十而立,四十而不惑”。其意就是三十岁进入了而立之年的意思。", "label": { "entity": { "春秋时期": [[10, 13]], "论语": [[23, 24]]}}}
{ "text": "博科FWS648G-EPREM是博科品牌下的一款交换机。", "label": { "entity": { "交换机": [[24, 26]]}}}

2 Implementation


  
# Import the required libraries
import os
import json
import random  # used by Data_preprocess below
import numpy as np
import torch
from transformers.models.bert.modeling_bert import *
from tqdm import tqdm
import NER_config
import jsonlines
import transformers
import torch.nn as nn
from loguru import logger
from sklearn.metrics import accuracy_score, recall_score, f1_score
from transformers.optimization import get_cosine_schedule_with_warmup, AdamW
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torchcrf import CRF
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Set up logging
log_path = "/home/zhenhengdong/WORk/NER/Baidu_NER/Log/"
logger.add(log_path + 'Train.log', format="{time} {level} {message}", level="INFO")

 NER_config.py


  
import os
import torch

# Pretrained model and saved-model paths
robert_model = '/ssd/Spider/Baidu_NER/Pre_Model/chinese_roberta_wwm_large_ext/'
model_dir = '/home/zhenhengdong/WORk/NER/Baidu_NER/Model/Baidu_ner_model2.pkl'
# Binary data files produced in section 2-1-1
train_dir = '/ssd/Spider/Baidu_NER/DataSets/Binary_file/train.npz'
test_dir = '/ssd/Spider/Baidu_NER/DataSets/Binary_file/test.npz'
# Train/dev split ratio
dev_split_size = 0.1
# Whether to load a previously trained NER model
load_before = False
# Device
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
# Whether to fine-tune the whole BERT model
full_fine_tuning = True
# Hyper-parameters
learning_rate = 3e-5
weight_decay = 0.01
clip_grad = 5
batch_size = 10
epoch_num = 150
min_epoch_num = 5
patience = 0.0002
patience_num = 10

labels = ['entity']
label2id = {
    "O": 0,
    "B-entity": 1,
    "I-entity": 2,
    "E-entity": 3,
    "S-entity": 4,
}
id2label = {_id: _label for _label, _id in list(label2id.items())}

# BertNER hyper-parameters; they can also be set in the pretrained model's config
# num_labels = len(label2id)
# hidden_dropout_prob = 0.3
# lstm_embedding_size = 768
# hidden_size = 1024
# lstm_dropout_prob = 0.5

2-1 Data processing

        In the data-processing stage, the data is converted to the tag set B-entity, I-entity, E-entity, S-entity and O, where B-entity marks the beginning of an entity, I-entity its interior, E-entity its end, S-entity a single-character entity, and O everything else in the text. The code is as follows.


  
# Data preprocessing
def Data_preprocess(input_filename, output_filename):
    count = 0
    word_list = []
    label_list = []
    with open(input_filename, 'r') as reader:
        lines = reader.readlines()
    random_list = []
    # Sample 12000 lines
    for _ in tqdm(range(12000)):
        # Pick a random line index (training set has 4495465 lines, test set 499497)
        random_index = random.randint(0, len(lines) - 1)
        if random_index not in random_list:
            random_list.append(random_index)
            json_line = json.loads(lines[random_index].strip())
            text = json_line['text']
            # Keep only texts within the BERT length limit
            if len(text) <= 510:
                words = list(text)
                label_entity = json_line.get('label', None)
                # Initialize all labels to "O"
                label = ['O'] * len(words)
                if label_entity is not None:
                    count += 1
                    for key, value in label_entity.items():
                        for sub_name, sub_index in value.items():
                            for start_index, end_index in sub_index:
                                # Boundary check: the span must match the entity string
                                if ''.join(words[start_index:end_index + 1]) == sub_name:
                                    # Single-character entity: S-entity
                                    if start_index == end_index:
                                        label[start_index] = 'S-' + key
                                    else:
                                        # Multi-character entity: B-entity / I-entity / E-entity
                                        label[start_index] = "B-" + key
                                        label[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
                                        label[end_index] = 'E-' + key
                word_list.append(words)
                label_list.append(label)
            else:
                continue
    print(len(word_list), len(label_list))
    # Save as a compressed binary file
    np.savez_compressed(output_filename, words=word_list, labels=label_list)
    # Number of samples that contained labels
    print(count)
    return word_list, label_list

2-1-1 Generating the binary files

        The raw DataSet is split 9:1 into train.jsonl and test.jsonl, and both files are then run through the preprocessing above (a minimal sketch of the split itself is given after the two calls below).


  
train_input = '/ssd/Spider/Baidu_NER/DataSets/Ori_data/train.jsonl'
train_output = '/ssd/Spider/Baidu_NER/DataSets/Binary_file/train.npz'
word_list, label_list = Data_preprocess(train_input, train_output)

  
test_input = '/ssd/Spider/Baidu_NER/DataSets/Ori_data/test.jsonl'
test_output = '/ssd/Spider/Baidu_NER/DataSets/Binary_file/test.npz'
Data_preprocess(test_input, test_output)
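The 9:1 split itself is not shown in the original code. A minimal sketch, assuming the combined source file is named DataSet.jsonl (the file name and seed are assumptions), could look like this:

# Sketch only: split a jsonl corpus into train/test at a 9:1 ratio.
import random

def split_jsonl(input_filename, train_filename, test_filename, test_ratio=0.1, seed=42):
    with open(input_filename, 'r') as reader:
        lines = reader.readlines()
    random.seed(seed)
    random.shuffle(lines)
    split_point = int(len(lines) * (1 - test_ratio))
    with open(train_filename, 'w') as writer:
        writer.writelines(lines[:split_point])
    with open(test_filename, 'w') as writer:
        writer.writelines(lines[split_point:])

split_jsonl('/ssd/Spider/Baidu_NER/DataSets/Ori_data/DataSet.jsonl',
            '/ssd/Spider/Baidu_NER/DataSets/Ori_data/train.jsonl',
            '/ssd/Spider/Baidu_NER/DataSets/Ori_data/test.jsonl')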

 2-1-2 Labeling example


  
words example:  ['聚', '叶', '黔', '川', '乌', '头', '(', '变', '种', ')', '是', '四', '川', '北', '部', '青', '川', '的', '植', '物', '。']
labels example: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-entity', 'E-entity', 'O', 'O', 'O', 'O']

2-2 Converting the data to ids

        The previous step attached labels to the entity positions in the raw data and stored the result as binary files; the data now needs to be encoded into id form. The implementation is as follows.


  
class NERDataset(Dataset):
    def __init__(self, words, labels, config, word_pad_idx=0, label_pad_idx=-1):
        self.tokenizer = BertTokenizer.from_pretrained(config.robert_model, do_lower_case=True)
        self.label2id = config.label2id
        self.id2label = {_id: _label for _label, _id in list(config.label2id.items())}
        self.dataset = self.preprocess(words, labels)
        self.word_pad_idx = word_pad_idx
        self.label_pad_idx = label_pad_idx
        self.device = config.device

    def preprocess(self, origin_sentences, origin_labels):
        data = []
        sentences = []
        labels = []
        for line in tqdm(origin_sentences):
            # replace each token by its index
            # we cannot use encode_plus because our sentences are aligned to labels in list form
            words = []
            word_lens = []
            for token in line:
                # tokenize each character so BERT can map it to an id
                words.append(self.tokenizer.tokenize(token))
                word_lens.append(len(token))
            # flatten into a single token list and prepend [CLS]
            words = ['[CLS]'] + [item for token in words for item in token]
            token_start_idxs = 1 + np.cumsum([0] + word_lens[:-1])
            sentences.append((self.tokenizer.convert_tokens_to_ids(words), token_start_idxs))
        for tag in origin_labels:
            label_id = [self.label2id.get(t) for t in tag]
            labels.append(label_id)
        for sentence, label in zip(sentences, labels):
            if len(sentence[0]) - len(label) == 1:
                data.append((sentence, label))
        return data

    def __getitem__(self, idx):
        """sample data to get batch"""
        word = self.dataset[idx][0]
        label = self.dataset[idx][1]
        return [word, label]

    def __len__(self):
        return len(self.dataset)

    def collate_fn(self, batch):
        sentences = [x[0] for x in batch]
        labels = [x[1] for x in batch]
        # batch length
        batch_len = len(sentences)
        # length of the longest sentence in the batch
        max_len = max([len(s[0]) for s in sentences])
        max_label_len = 0
        # initialize padded data
        batch_data = self.word_pad_idx * np.ones((batch_len, max_len))
        batch_label_starts = []
        # padding and aligning
        for j in range(batch_len):
            cur_len = len(sentences[j][0])
            batch_data[j][:cur_len] = sentences[j][0]
            # find the positions that carry labels ([CLS] does not count)
            label_start_idx = sentences[j][-1]
            label_starts = np.zeros(max_len)
            label_starts[[idx for idx in label_start_idx if idx < max_len]] = 1
            batch_label_starts.append(label_starts)
            max_label_len = max(int(sum(label_starts)), max_label_len)
        # padding labels
        batch_labels = self.label_pad_idx * np.ones((batch_len, max_label_len))
        for j in range(batch_len):
            cur_tags_len = len(labels[j])
            batch_labels[j][:cur_tags_len] = labels[j]
        # convert data to torch LongTensors
        batch_data = torch.tensor(batch_data, dtype=torch.long)
        batch_label_starts = torch.tensor(batch_label_starts, dtype=torch.long)
        batch_labels = torch.tensor(batch_labels, dtype=torch.long)
        # move tensors to GPU if available
        batch_data, batch_label_starts = batch_data.to(self.device), batch_label_starts.to(self.device)
        batch_labels = batch_labels.to(self.device)
        return [batch_data, batch_label_starts, batch_labels]
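A minimal usage sketch of how the dataset and collate_fn fit together (the path follows section 2-1-1; the shape comments are illustrative, not measured output):

# Sketch only: build a dataset from the binary file and inspect one batch.
data = np.load('/ssd/Spider/Baidu_NER/DataSets/Binary_file/train.npz', allow_pickle=True)
train_dataset = NERDataset(data['words'], data['labels'], NER_config)
train_loader = DataLoader(train_dataset, batch_size=NER_config.batch_size,
                          shuffle=True, collate_fn=train_dataset.collate_fn)
batch_data, batch_label_starts, batch_labels = next(iter(train_loader))
print(batch_data.shape)          # (batch_size, max_len): token ids including [CLS], 0 is padding
print(batch_label_starts.shape)  # (batch_size, max_len): 1 at every position that carries a label
print(batch_labels.shape)        # (batch_size, max_label_len): label ids, -1 is padding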

2-3 Model architecture

        For the model, BERT is used as the underlying feature extractor; a bidirectional LSTM and a linear layer are added on top to produce a score for each tag, and these emission scores are passed to a CRF, which combines them with the transition matrix to decode the best tag sequence. The code is shown below:


  
class BertNER(BertPreTrainedModel):
    def __init__(self, config):
        super(BertNER, self).__init__(config)
        # number of label classes; this could also live in the pretrained model's config
        self.num_labels = 5
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.bilstm = nn.LSTM(
            input_size=config.lstm_embedding_size,   # 1024, BERT's output dimension
            hidden_size=config.hidden_size // 2,     # half per direction, so the BiLSTM output is hidden_size
            batch_first=True,
            num_layers=2,
            dropout=config.lstm_dropout_prob,        # 0.5
            bidirectional=True
        )
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.crf = CRF(self.num_labels, batch_first=True)
        self.init_weights()

    def forward(self, input_data, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, inputs_embeds=None, head_mask=None):
        input_ids, input_token_starts = input_data
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)
        sequence_output = outputs[0]
        # drop [CLS] and other non-starting positions so the output aligns with the labels
        origin_sequence_output = [layer[starts.nonzero().squeeze(1)]
                                  for layer, starts in zip(sequence_output, input_token_starts)]
        # pad the per-sentence outputs to the longest sequence in the batch
        padded_sequence_output = pad_sequence(origin_sequence_output, batch_first=True)
        # apply dropout to part of the features
        padded_sequence_output = self.dropout(padded_sequence_output)
        # feed the result into the BiLSTM to extract features again
        lstm_output, _ = self.bilstm(padded_sequence_output)
        # feed the LSTM output into the linear layer for 5-way classification
        logits = self.classifier(lstm_output)
        outputs = (logits,)
        if labels is not None:
            loss_mask = labels.gt(-1)
            # feed the emission scores into the CRF and negate the log-likelihood to get the loss
            loss = self.crf(logits, labels, loss_mask) * (-1)
            outputs = (loss,) + outputs
        # contains: (loss), scores
        return outputs
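Note that the model reads several non-standard attributes from its config (hidden_dropout_prob, lstm_embedding_size, lstm_dropout_prob). The commented-out block at the end of NER_config.py hints that they can be placed on the pretrained config; a minimal sketch of one way to do that is shown below. The values mirror those comments and are assumptions, and lstm_embedding_size has to match BERT's hidden size (1024 for chinese_roberta_wwm_large_ext):

# Sketch only: attach the extra BertNER hyper-parameters to the pretrained config
# before building the model. Values follow the comments in NER_config.py.
from transformers import BertConfig

bert_config = BertConfig.from_pretrained(NER_config.robert_model)
bert_config.hidden_dropout_prob = 0.3
bert_config.lstm_embedding_size = bert_config.hidden_size  # the BiLSTM consumes BERT's output (1024-dim here)
bert_config.lstm_dropout_prob = 0.5
model = BertNER.from_pretrained(NER_config.robert_model, config=bert_config)
model.to(NER_config.device)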

2-4 Training function


  
# Training function for one epoch
def train_epoch(train_loader, model, optimizer, scheduler, epoch):
    # set the model to training mode
    model.train()
    train_losses = 0
    for idx, batch_samples in enumerate(tqdm(train_loader)):
        batch_data, batch_token_starts, batch_labels = batch_samples
        batch_masks = batch_data.gt(0)  # get padding mask
        # compute the loss
        loss = model((batch_data, batch_token_starts),
                     token_type_ids=None, attention_mask=batch_masks, labels=batch_labels)[0]
        train_losses += loss.item()
        # gradient update
        model.zero_grad()
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=NER_config.clip_grad)
        # parameter and learning-rate updates
        optimizer.step()
        scheduler.step()
    train_loss = float(train_losses) / len(train_loader)
    logger.info("Epoch: {}, train loss: {}", epoch, train_loss)

2-5 Evaluation function


  
# Compute evaluation metrics from predictions and gold labels
def compute_acc_recall(batch_output, batch_tags):
    acc = 0
    recall = 0
    f1 = 0
    for index in range(len(batch_output)):
        acc += accuracy_score(batch_output[index], batch_tags[index])
        recall += recall_score(batch_output[index], batch_tags[index], average='macro')
        f1 += f1_score(batch_output[index], batch_tags[index], average='macro')
    return (acc / len(batch_output), recall / len(batch_output), f1 / len(batch_output))

# Evaluation function
def evaluate(dev_loader, model, mode='dev'):
    # set the model to evaluation mode
    model.eval()
    if mode == 'test':
        tokenizer = BertTokenizer.from_pretrained(NER_config.robert_model, do_lower_case=True, skip_special_tokens=True)
    id2label = NER_config.id2label
    true_tags = []
    pred_tags = []
    sent_data = []
    dev_losses = 0
    with torch.no_grad():
        for idx, batch_samples in tqdm(enumerate(dev_loader)):
            batch_data, batch_token_starts, batch_tags = batch_samples
            if mode == 'test':
                sent_data.extend([[tokenizer.convert_ids_to_tokens(idx.item()) for idx in indices
                                   if (idx.item() > 0 and idx.item() != 101)] for indices in batch_data])
            batch_masks = batch_data.gt(0)   # padding mask, gt(x): positions greater than x
            label_masks = batch_tags.gt(-1)  # padding mask for labels
            # compute model output and loss
            loss = model((batch_data, batch_token_starts),
                         token_type_ids=None, attention_mask=batch_masks, labels=batch_tags)[0]
            dev_losses += loss.item()
            # (batch_size, max_len, num_labels)
            batch_output = model((batch_data, batch_token_starts),
                                 token_type_ids=None, attention_mask=batch_masks)[0]
            # (batch_size, max_len - padding_label_len)
            batch_output = model.crf.decode(batch_output, mask=label_masks)
            # (batch_size, max_len)
            batch_tags = batch_tags.to('cpu').numpy()
            pred_tags.extend([[idx for idx in indices] for indices in batch_output])
            # (batch_size, max_len - padding_label_len)
            true_tags.extend([[idx for idx in indices if idx > -1] for indices in batch_tags])
            # pred_tags.extend([[id2label.get(idx) for idx in indices] for indices in batch_output])
            # true_tags.extend([[id2label.get(idx) for idx in indices if idx > -1] for indices in batch_tags])
    assert len(pred_tags) == len(true_tags)
    # logging loss, f1 and report
    metrics = {}
    acc, recall, f1 = compute_acc_recall(true_tags, pred_tags)
    metrics['acc'] = acc
    metrics['recall'] = recall
    metrics['f1'] = f1
    metrics['loss'] = float(dev_losses) / len(dev_loader)
    return metrics

2-6 Test function


  
def test(NER_config):
    data = np.load(NER_config.test_dir, allow_pickle=True)
    word_test = data["words"]
    label_test = data["labels"]
    test_dataset = NERDataset(word_test, label_test, NER_config)
    # build data_loader
    test_loader = DataLoader(test_dataset, batch_size=NER_config.batch_size,
                             shuffle=False, collate_fn=test_dataset.collate_fn)
    # prepare the model (training saves the whole model with torch.save, so load it with torch.load)
    if NER_config.model_dir is not None:
        model = torch.load(NER_config.model_dir)
        model.to(NER_config.device)
    val_metrics = evaluate(test_loader, model, mode='test')
    logger.info("test loss: {}, f1 score: {}", val_metrics['loss'], val_metrics['f1'])

2-7 Training and validation


  
def train(train_loader, dev_loader, model, optimizer, scheduler, model_dir):
    """train the model and evaluate it on the dev set"""
    best_val_f1 = 0.0
    patience_counter = 0
    # start training
    for epoch in range(1, NER_config.epoch_num + 1):
        train_epoch(train_loader, model, optimizer, scheduler, epoch)
        # validation
        val_metrics = evaluate(dev_loader, model, mode='dev')
        val_f1 = val_metrics['f1']
        logger.info("Epoch: {}, dev loss: {}, f1 score: {}", epoch, val_metrics['loss'], val_f1)
        improve_f1 = val_f1 - best_val_f1
        if improve_f1 > 1e-5:
            best_val_f1 = val_f1
            # save the whole model (loaded back with torch.load in test())
            torch.save(model, model_dir)
            logger.info("--------Save best model!--------")
            if improve_f1 < NER_config.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1
        # early stopping and logging the best f1
        if (patience_counter >= NER_config.patience_num and epoch > NER_config.min_epoch_num) or epoch == NER_config.epoch_num:
            logger.info("Best val f1: {}", best_val_f1)
            break
    logger.info("Training Finished!")

2-8 Splitting the training and dev sets


  
def dev_split(dataset_dir):
    """split a dev set off the training set"""
    data = np.load(dataset_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    x_train, x_dev, y_train, y_dev = train_test_split(words, labels, test_size=NER_config.dev_split_size, random_state=0)
    return x_train, x_dev, y_train, y_dev

2-9 Training entry point


  
def run(config):
    """train the model"""
    # split the training data into train and dev sets
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # build the datasets
    train_dataset = NERDataset(word_train, label_train, config)
    dev_dataset = NERDataset(word_dev, label_dev, config)
    # get dataset size
    train_size = len(train_dataset)
    # build the dataloaders
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    # instantiate the model
    device = config.device
    model = BertNER.from_pretrained(config.robert_model, num_labels=len(config.label2id))
    model.to(device)
    # prepare the optimizer
    if config.full_fine_tuning:
        # model.named_parameters(): [bert, bilstm, classifier, crf]
        bert_optimizer = list(model.bert.named_parameters())
        lstm_optimizer = list(model.bilstm.named_parameters())
        classifier_optimizer = list(model.classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in lstm_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in lstm_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': model.crf.parameters(), 'lr': config.learning_rate * 5}
        ]
    # only fine-tune the classifier head
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False)
    train_steps_per_epoch = train_size // config.batch_size
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
                                                num_training_steps=config.epoch_num * train_steps_per_epoch)
    # train the model
    logger.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)


 2-10 Training and testing


  
if __name__ == '__main__':
    run(NER_config)
    test(NER_config)

2-11 Inference functions


  
# Inference function
def infer_function(dev_loader, model, mode='dev'):
    # set the model to evaluation mode
    model.eval()
    if mode == 'test':
        tokenizer = BertTokenizer.from_pretrained(NER_config.robert_model, do_lower_case=True, skip_special_tokens=True)
    id2label = NER_config.id2label
    true_tags = []
    pred_tags = []
    sent_data = []
    dev_losses = 0
    with torch.no_grad():
        for idx, batch_samples in enumerate(dev_loader):
            batch_data, batch_token_starts, batch_tags = batch_samples
            if mode == 'test':
                sent_data.extend([[tokenizer.convert_ids_to_tokens(idx.item()) for idx in indices
                                   if (idx.item() > 0 and idx.item() != 101)] for indices in batch_data])
            batch_masks = batch_data.gt(0)   # padding mask, gt(x): positions greater than x
            label_masks = batch_tags.gt(-1)  # padding mask for labels
            # the loss is not needed at inference time
            # loss = model((batch_data, batch_token_starts),
            #              token_type_ids=None, attention_mask=batch_masks, labels=batch_tags)[0]
            # dev_losses += loss.item()
            # (batch_size, max_len, num_labels)
            batch_output = model((batch_data, batch_token_starts),
                                 token_type_ids=None, attention_mask=batch_masks)[0]
            # (batch_size, max_len - padding_label_len)
            batch_output = model.crf.decode(batch_output, mask=label_masks)
            # batch_tags = batch_tags.to('cpu').numpy()
            pred_tags.extend([[id2label.get(idx) for idx in indices] for indices in batch_output])
    return pred_tags

  
def new_infer(text):
    words = list(text)
    label = ['O'] * len(words)
    word_list = []
    label_list = []
    word_list.append(words)
    label_list.append(label)
    output_filename = '/home/zhenhengdong/WORk/NER/Try_ner/Datasets/Binary_file/infer.npz'
    np.savez_compressed(output_filename, words=word_list, labels=label_list)
    # reload the file
    data = np.load(output_filename, allow_pickle=True)
    word_test = data["words"]
    label_test = data["labels"]
    test_dataset = NERDataset(word_test, label_test, NER_config)
    # build data_loader
    test_loader = DataLoader(test_dataset, batch_size=NER_config.batch_size,
                             shuffle=False, collate_fn=test_dataset.collate_fn)
    # prepare the model
    if NER_config.model_dir is not None:
        # the model was saved with torch.save, so load it with torch.load
        model = torch.load(NER_config.model_dir)
        model.to(NER_config.device)
        logger.info("--------Load model from {}--------".format(NER_config.model_dir))
    else:
        logger.info("--------No model to test !--------")
        return
    pred_tags = infer_function(test_loader, model, mode='test')
    return pred_tags

2-12 Displaying the predictions


  
text = '2022年11月,马拉西亚随荷兰国家队征战2022年卡塔尔世界杯'
pred_tags = new_infer(text)
# collect the start and end positions of the predicted entities
start_index_list = []
end_index_list = []
for index in range(len(pred_tags[0])):
    if index != 0 and pred_tags[0][index] != 'O' and pred_tags[0][index - 1] == 'O':
        start_index_list.append(index)
    if index != len(pred_tags[0]) - 1 and pred_tags[0][index] != 'O' and pred_tags[0][index + 1] == 'O':
        end_index_list.append(index)
    if index == 0 and pred_tags[0][index] != 'O':
        start_index_list.append(index)
    if index == len(pred_tags[0]) - 1 and pred_tags[0][index] != 'O':
        end_index_list.append(index)
# print the extracted entities
for index in range(len(start_index_list)):
    print(text[start_index_list[index]:end_index_list[index] + 1])

Closing thoughts

        As a newly hired algorithm engineer, I increasingly appreciate how important the data is. Balanced, diverse samples matter far more than swapping models or tuning hyper-parameters. For ordinary data, a combination of LSTM and attention is usually enough to meet the performance requirements, and harder data can be handled with a pretrained model. In industry, a good algorithm engineer is first of all a good data analyst: analyze your data thoroughly before training a model. In this task, for example, the first attempt used B/I/O/S tags and performed worse than expected, with entity boundaries spilling over in the predictions. There were two possible fixes: switch to a span-pointer network, or change the labeling scheme by adding an E tag to mark the end of an entity. The second option was clearly the faster route, and after adding the E tag the boundary-overflow problem disappeared.

References:

(1) hemingkx/CLUENER2020 (GitHub): A PyTorch implementation of a BiLSTM/BERT/Roberta (+CRF) model for Named Entity Recognition.

(2) "BiLSTM模型中CRF层的运行原理(2)", 闲记算法 (blog).

        

