[深度学习] 自然语言处理--- 基于Keras Bert使用（上）

2020-04-01 15:16 1181人阅读评论(0)

1. bert ---- keras

keras_bert 是 CyberZHG 封装好的 Keras 版 Bert，可以直接调用官方发布的预训练权重。

github：https://github.com/CyberZHG/keras-bert

快速安装：pip install keras-bert

bert4keras 是 bojone 封装好的 Keras 版 Bert，同样可以直接调用官方发布的预训练权重（支持 tf2）。

github：https://github.com/bojone/bert4keras

快速安装：pip install git+https://www.github.com/bojone/bert4keras.git

2.keras_bert

2.1.Tokenizer

在 keras-bert 里面，使用 Tokenizer 会将文本拆分成字并生成相应的id。

我们需要提供一个字典，字典存放着 token 和 id 的映射。字典里还有 BERT 里特别的 token。

[CLS]，[SEP]，[UNK]等

在下面的示例中，如果文本拆分出来的字在字典不存在，它的 id 会是 5，代表 [UNK]，即 unknown

我们用同样的字典去拆分字典中不存在的单词，结果如下：可以看到，对于英语单词，字典中不存在的部分会被直接按字母拆分。

在输入两句话的例子中，我们可以给 encode 函数带上参数 max_len，只保留文本拆分出来的前 max_len 个字

如果拆分完的字数不足 max_len，则用 0 填充

2.2.使用预训练模型

参考地址：https://github.com/CyberZHG/keras-bert/tree/master/demo

我们可以使用 load_trained_model_from_checkpoint() 函数加载本地已经下载好的预训练模型，

可以从 BERT 的 github 上获取下载地址

谷歌BERT地址：https://github.com/google-research/bert

中文预训练BERT-wwm：https://github.com/ymcui/Chinese-BERT-wwm

下面是使用预训练模型提取输入文本的特征


  
   
    
     
    
    
     
      import os

      from keras_bert import (
          Tokenizer,
          load_trained_model_from_checkpoint,
          load_vocabulary,
      )

      # Paths into the locally available pre-trained model directory.
      pretrained_path = 'chinese_L-12_H-768_A-12'
      config_path = os.path.join(pretrained_path, 'bert_config.json')
      checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
      vocab_path = os.path.join(pretrained_path, 'vocab.txt')

      # Build the vocabulary: keras_bert's load_vocabulary() only needs vocab_path.
      token_dict = load_vocabulary(vocab_path)

      # Manual alternative from the original article, kept for reference
      # (maps each vocab line to its index in the file):
      # import codecs
      # token_dict = {}
      # with codecs.open(vocab_path, 'r', 'utf8') as reader:
      #     for line in reader:
      #         token = line.strip()
      #         token_dict[token] = len(token_dict)

      # Tokenization
      tokenizer = Tokenizer(token_dict)

      # Load the pre-trained model from the config and checkpoint files.
      model = load_trained_model_from_checkpoint(config_path, checkpoint_path)


  
   
    
     
    
    
     
      import numpy as np

      # Relies on `tokenizer` and `model` created in the keras_bert setup snippet.
      text = '语言模型'
      tokens = tokenizer.tokenize(text)
      # ['[CLS]', '语', '言', '模', '型', '[SEP]']
      indices, segments = tokenizer.encode(first=text, max_len=512)
      print(indices[:10])
      print(segments[:10])

      # Extract features: wrap each sequence in a batch of size 1, then take
      # the first (only) item of the prediction.
      predicts = model.predict([np.array([indices]), np.array([segments])])[0]
      for i, token in enumerate(tokens):
          # Show only the first 5 values of each token's vector.
          print(token, predicts[i].tolist()[:5])


  
   
    
     
    
    
     
      import numpy as np

      # Relies on `tokenizer` and `model` created in the keras_bert setup snippet.
      text1 = '语言模型'
      text2 = "你好"
      tokens1 = tokenizer.tokenize(text1)
      print(tokens1)
      tokens2 = tokenizer.tokenize(text2)
      print(tokens2)

      indices_new, segments_new = tokenizer.encode(first=text1, second=text2, max_len=512)
      print(indices_new[:10])
      # NOTE(review): the original article listed
      # [101, 6427, 6241, 3563, 1798, 102, 0, 0, 0, 0] here, which looks copied
      # from the single-sentence case; with `second` supplied, the positions
      # after the first [SEP] presumably hold text2's ids - confirm by running.
      print(segments_new[:10])
      # NOTE(review): the original article listed all zeros here; text2's
      # positions presumably carry segment id 1 - confirm by running.

      # Extract features
      predicts_new = model.predict([np.array([indices_new]), np.array([segments_new])])[0]
      for i, token in enumerate(tokens1):
          print(token, predicts_new[i].tolist()[:5])
      # In the pair encoding text2 follows tokens1 and has no [CLS] of its own,
      # so drop tokens2[0] and start indexing right after tokens1. The original
      # loop restarted at 0 and printed the first sentence's vectors again.
      for i, token in enumerate(tokens2[1:], start=len(tokens1)):
          print(token, predicts_new[i].tolist()[:5])


  
   
    
     
    
    
     
      import numpy as np

      # Load the language model. training=True is passed so the model accepts the
      # extra mask input used below.
      # Relies on config_path, checkpoint_path, token_dict and tokenizer from the
      # keras_bert setup snippet.
      model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True)

      # Reverse lookup: id -> token, used to turn predictions back into text.
      token_dict_rev = {v: k for k, v in token_dict.items()}

      max_len = 512
      token_ids, segment_ids = tokenizer.encode(
          u'数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科', max_len=max_len)
      # Mask out "数学": positions 1 and 2 (position 0 is [CLS]).
      token_ids[1] = token_ids[2] = token_dict['[MASK]']
      # 1 marks a masked position, 0 everything else.
      masks = np.array([[0, 1, 1] + [0] * (max_len - 3)])

      # Predict the masked positions; [0] selects the first model output.
      predicts = model.predict([np.array([token_ids]), np.array([segment_ids]), masks])[0]
      # The original read from an undefined name `probas` here (NameError).
      pred_indice = predicts[0][1:3].argmax(axis=1).tolist()
      print('Fill with: ', [token_dict_rev[idx] for idx in pred_indice])
      # Result reported by the original article: Fill with: ['数', '学']

3 bert4keras

3.1 函数介绍

bert4keras 是基于 keras-bert 重新编写的一个 keras 版的 bert，

可以适配 albert，只需要在 build_bert_model 函数里加上model='albert'

使用体验和 keras_bert 差不多，下面是 github 提供的使用例子。

SimpleTokenizer是一个简单的分词器，直接将文本分割为单字符序列，专为中文处理设计，原则上只适用于中文模型。

build_bert_model 可用参数如下

config_path：JSON 配置文件路径

checkpoint_path：checkpoint 文件路径

with_mlm：是否包含 MLM 部分，默认 False

with_pool：是否包含 POOL 部分，默认 False

with_nsp：是否包含 NSP 部分，默认 False

keep_words：要保留的词ID列表

model：可以定义为 albert 模型, 默认 bert

application：模型用途，可选 'encoder'（BertModel）、'seq2seq'（Bert4Seq2seq）、'lm'（Bert4LM），默认 encoder

3.2 使用预训练模型

参考地址：https://github.com/bojone/bert4keras/blob/master/examples

我们可以使用 build_bert_model() 函数加载本地已经下载好的预训练模型，

可以从 BERT 的 github 上获取下载地址

谷歌BERT地址：https://github.com/google-research/bert

中文预训练BERT-wwm：https://github.com/ymcui/Chinese-BERT-wwm

下面是使用预训练模型提取输入文本的特征


  
   
    
     
    
    
     
      import os

      # NOTE(review): several of these imports (keras, set_gelu, Adam,
      # extend_with_piecewise_linear_lr, sequence_padding, DataGenerator and the
      # keras.layers star import) are not used by the snippets visible in this
      # article; they are kept because later parts may rely on them.
      from bert4keras.backend import keras, set_gelu
      from bert4keras.tokenizer import Tokenizer
      from bert4keras.bert import build_bert_model
      from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
      from bert4keras.snippets import sequence_padding, DataGenerator
      from keras.layers import *

      # Paths into the locally available pre-trained model directory.
      pretrained_path = 'chinese_L-12_H-768_A-12'
      config_path = os.path.join(pretrained_path, 'bert_config.json')
      checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
      vocab_path = os.path.join(pretrained_path, 'vocab.txt')

      # Manual vocabulary construction from the original article, kept for
      # reference (maps each vocab line to its index in the file):
      # import codecs
      # token_dict = {}
      # with codecs.open(vocab_path, 'r', 'utf8') as reader:
      #     for line in reader:
      #         token = line.strip()
      #         token_dict[token] = len(token_dict)

      # Build the tokenizer directly from the vocabulary file path.
      tokenizer = Tokenizer(vocab_path)
      # Reuse the tokenizer's own vocabulary as the token -> id dictionary
      # (note: this reads a private attribute of the tokenizer).
      token_dict = tokenizer._token_dict

      # Load the pre-trained model from the config and checkpoint files.
      model = build_bert_model(config_path=config_path, checkpoint_path=checkpoint_path)


  
   
    
     
    
    
     
      import numpy as np

      # Relies on `tokenizer` and `model` created in the bert4keras setup snippet.
      text = '语言模型'
      tokens = tokenizer.tokenize(text)
      # ['[CLS]', '语', '言', '模', '型', '[SEP]']
      indices, segments = tokenizer.encode(text, max_length=512)
      print(indices[:10])
      print(segments[:10])

      # Extract features: wrap each sequence in a batch of size 1, then take
      # the first (only) item of the prediction.
      predicts = model.predict([np.array([indices]), np.array([segments])])[0]
      for i, token in enumerate(tokens):
          # Show only the first 5 values of each token's vector.
          print(token, predicts[i].tolist()[:5])


  
   
    
     
    
    
     
      import numpy as np

      # Relies on `tokenizer` and `model` created in the bert4keras setup snippet.
      text1 = '语言模型'
      text2 = "你好"
      tokens1 = tokenizer.tokenize(text1)
      print(tokens1)
      tokens2 = tokenizer.tokenize(text2)
      print(tokens2)

      indices_new, segments_new = tokenizer.encode(text1, text2, max_length=512)
      print(indices_new[:10])
      # NOTE(review): the original article listed
      # [101, 6427, 6241, 3563, 1798, 102, 0, 0, 0, 0] here, which looks copied
      # from the single-sentence case; with a second text supplied, the positions
      # after the first [SEP] presumably hold text2's ids - confirm by running.
      print(segments_new[:10])
      # NOTE(review): the original article listed all zeros here; text2's
      # positions presumably carry segment id 1 - confirm by running.

      # Extract features
      predicts_new = model.predict([np.array([indices_new]), np.array([segments_new])])[0]
      for i, token in enumerate(tokens1):
          print(token, predicts_new[i].tolist()[:5])
      # In the pair encoding text2 follows tokens1 and has no [CLS] of its own,
      # so drop tokens2[0] and start indexing right after tokens1. The original
      # loop restarted at 0 and printed the first sentence's vectors again.
      for i, token in enumerate(tokens2[1:], start=len(tokens1)):
          print(token, predicts_new[i].tolist()[:5])


  
   
    
     
    
    
     
      import numpy as np

      # Load the language model with the MLM part included (with_mlm=True).
      # Relies on config_path, checkpoint_path, tokenizer and token_dict from the
      # bert4keras setup snippet.
      model = build_bert_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True)

      token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
      # Mask out "技术": positions 3 and 4 (position 0 is [CLS]).
      token_ids[3] = token_ids[4] = token_dict['[MASK]']

      # Use the MLM model to predict the masked positions.
      probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
      # The original article reports that this prints "技术".
      print(tokenizer.decode(probas[3:5].argmax(axis=1)))

      token_ids, segment_ids = tokenizer.encode(
          u'数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科')
      # Mask out "数学": positions 1 and 2 (position 0 is [CLS]).
      token_ids[1] = token_ids[2] = token_dict['[MASK]']

      # Use the MLM model to predict the masked positions.
      probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
      # The original article reports that this prints "数学".
      print(tokenizer.decode(probas[1:3].argmax(axis=1)))

转载：https://blog.csdn.net/zwqjoy/article/details/103671088

查看评论

飞道的博客

飞道的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章