
Transformer NMT translation model code example

import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt


#Load the dataset
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)
#print(metadata)
#print(examples)
#{'test': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>, 
#'train': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>,
#'validation': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>}
train_examples, val_examples = examples['train'], examples['validation']
#print(train_examples)
#print(val_examples)
#<_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
#<_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
#
#print('pt:', (en.numpy() for pt, en in train_examples))



#Tokenization: build subword vocabularies
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2**13)
#2**13 = 8192 is the approximate size of the vocabulary to create.

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)

#print(tokenizer_en)
#print(tokenizer_pt)
#<SubwordTextEncoder vocab_size=8087>
#<SubwordTextEncoder vocab_size=8214>
print(tokenizer_pt.vocab_size)
print(tokenizer_en.vocab_size)
#8214
#8087





#Subword mapping: encode a string into ids and decode it back
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))
#Tokenized string is [7915, 1248, 7946, 7194, 13, 2799, 7877]

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))
#The original string: Transformer is awesome.

assert original_string == sample_string

for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

#7915 ----> T
#1248 ----> ran
#7946 ----> s
#7194 ----> former 
#13 ----> is 
#2799 ----> awesome
#7877 ----> .




BUFFER_SIZE = 20000
BATCH_SIZE = 64


#Add a start and an end token to the input and the target.
#The start/end token ids are appended after the existing vocabulary (vocab_size and vocab_size+1).
##train_examples <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
##8214
#8087
def encode(lang1, lang2):
  #print('lang1.numpy()>>:',lang1.numpy()," lang2.numpy()>>:", lang2.numpy())
  #lang1.numpy()>>: b'espero que todos consigam ver .'  lang2.numpy()>>: b'so i hope you can see .'
  lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
      lang1.numpy()) + [tokenizer_pt.vocab_size+1]
 

  lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang2.numpy()) + [tokenizer_en.vocab_size+1]

  #print('lang1:',lang1,"lang2:",lang2)
  #ang1.numpy()>>: b'espero que todos consigam ver .'  lang2.numpy()>>: b'so i hope you can see .'
  #lang1: [8214, 1259, 5, 63, 5284, 50, 277, 2, 8215] lang2: [8087, 18, 12, 631, 15, 31, 272, 2, 8088]
  return lang1, lang2



#To keep this example small and relatively fast, drop examples longer than 40 tokens.
#train_examples <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
#Filter based on the boolean value returned below.
MAX_LENGTH = 40
def filter_max_length(x, y, max_length=MAX_LENGTH):
    
  #print('x:',x,"y:",y)
  #print("tf.logical_and:>>",tf.logical_and(tf.size(x) <= max_length,
 #                    tf.size(y) <= max_length)
  
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)


#x: Tensor("args_0:0", dtype=int64) y: Tensor("args_1:0", dtype=int64)
#tf.logical_and:>> Tensor("LogicalAnd:0", shape=(), dtype=bool)
#x: Tensor("args_0:0", dtype=int64) y: Tensor("args_1:0", dtype=int64)
#tf.logical_and:>> Tensor("LogicalAnd:0", shape=(), dtype=bool)









##train_examples <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
def tf_encode(pt, en):
  #
  return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])

train_dataset = train_examples.map(tf_encode)
#print('train_dataset1',train_dataset)
#train_examples <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
#Filter using the boolean predicate defined above
train_dataset = train_dataset.filter(filter_max_length)
#print('train_dataset2',train_dataset)
# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
#print('train_dataset3',train_dataset)
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
#print('train_dataset4',train_dataset)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
#print('train_dataset5',train_dataset)
#train_dataset1 <MapDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>
#train_dataset2 <FilterDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>
#train_dataset3 <CacheDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>
#train_dataset4 <PaddedBatchDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>
#train_dataset5 <PrefetchDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>



val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))

pt_batch, en_batch = next(iter(val_dataset))
pt_batch, en_batch
#print("pt_batch:",pt_batch,"en_batch:",en_batch)
#pt_batch: tf.Tensor(
#[[8214 1259    5 ...    0    0    0]
# [8214  299   13 ...    0    0    0]
#[8214   59    8 ...    0    0    0]
# ...
# [8214   95    3 ...    0    0    0]
# [8214 5157    1 ...    0    0    0]
# [8214 4479 7990 ...    0    0    0]], shape=(64, 40), dtype=int64)
#en_batch: tf.Tensor(
#[[8087   18   12 ...    0    0    0]
# [8087  634   30 ...    0    0    0]
#[8087   16   13 ...    0    0    0]
# ...
#[8087   12   20 ...    0    0    0]
# [8087   17 4981 ...    0    0    0]
#[8087   12 5453 ...    0    0    0]], shape=(64, 40), dtype=int64)






#Positional encoding: inject position information
#pos(8216,1)  i(1,512)   d_model=512
def get_angles(pos, i, d_model):
  #(1,512)  
  #
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  #(1,512) 
  #pos(8216,1) *(1,512) 
  #(8216,512) positional-angle matrix
  return pos * angle_rates
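
# For reference, get_angles above and positional_encoding below implement the
# positional-encoding formula from "Attention Is All You Need":
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))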


#angle_rates (1,512)  
#[[1.00000000e+00 1.00000000e+00 9.64661620e-01 9.64661620e-01
#  9.30572041e-01 9.30572041e-01 8.97687132e-01 8.97687132e-01
#  8.65964323e-01 8.65964323e-01 8.35362547e-01 8.35362547e-01
#  8.05842188e-01 8.05842188e-01 7.77365030e-01 7.77365030e-01
#  ..............................................................................................................
#  1.33352143e-04 1.33352143e-04 1.28639694e-04 1.28639694e-04
#  1.24093776e-04 1.24093776e-04 1.19708503e-04 1.19708503e-04
#  1.15478198e-04 1.15478198e-04 1.11397386e-04 1.11397386e-04
#  1.07460783e-04 1.07460783e-04 1.03663293e-04 1.03663293e-04]]

#angle_rads
#angle_rads:  (50, 512)
#[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
#  0.00000000e+00 0.00000000e+00]
# [1.00000000e+00 1.00000000e+00 9.64661620e-01 ... 1.07460783e-04
#  1.03663293e-04 1.03663293e-04]
# [2.00000000e+00 2.00000000e+00 1.92932324e+00 ... 2.14921566e-04
#  2.07326586e-04 2.07326586e-04]
# ...
# [4.70000000e+01 4.70000000e+01 4.53390961e+01 ... 5.05065679e-03
#  4.87217476e-03 4.87217476e-03]
# [4.80000000e+01 4.80000000e+01 4.63037578e+01 ... 5.15811758e-03
#  4.97583806e-03 4.97583806e-03]
# [4.90000000e+01 4.90000000e+01 4.72684194e+01 ... 5.26557836e-03
#  5.07950135e-03 5.07950135e-03]]





##(1, position, 512)  final positional-encoding matrix
#input_vocab_size=8216   d_model=512
#(8216,512)
def positional_encoding(position, d_model):
  
  
  #pos(8216,1)  i(1,512)   d_model=512
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  #angle_rads (8216,512) positional-angle matrix


  # apply sin to even indices in the array; 2i
  #(50,512) even columns get sin (step=2)
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  #odd columns get cos
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
  #(1,8216,512)  final positional-encoding matrix
  pos_encoding = angle_rads[np.newaxis, ...]
  print('pos_encoding',pos_encoding.shape)
  #pos_encoding(1,8216,512)
    
  return tf.cast(pos_encoding, dtype=tf.float32)

#(np.arange(position):>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
#24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
# 48 49....................8215]

#pos(8216,1) 
#(np.arange(position)[:, np.newaxis]:>>> 
#[[ 0]
#[ 1]
# [ 2]
#[ 3]
# ...
# [48]
# [49]
#...
#[8215]]

#np.arange(d_model):>>> [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
#  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
#..............................................................................................................
# 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
# 504 505 506 507 508 509 510 511]

#(1,512)
#(np.arange(d_model)[np.newaxis, :]>>> [[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
#   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
#   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
#.......................................................................................................
#  486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
#  504 505 506 507 508 509 510 511]]



"""
##(50,512)
pos_encoding = positional_encoding(50, 512)
print (pos_encoding.shape)
#(1, 50, 512)
#heat map of the (50,512) matrix values
plt.pcolormesh(pos_encoding[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 512))
plt.ylabel('Position')
plt.colorbar()
plt.show()
"""





#Masks
#A mask hides certain positions so that they have no effect when the parameters are updated.
#The Transformer uses two kinds of mask: the padding mask and the sequence (look-ahead) mask.
#inp: (64, 39)
def create_padding_mask(seq):
  #print('seq:',seq)

  #print('create_padding_mask>>seq:',seq.shape)
  #print('mask out the positions whose value is 0')
  #inp: (64, 39)  
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  #print(seq)
  #inp: (64, 39)  
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

#seq: tf.Tensor(
#[[7 6 0 0 1]
#[1 2 3 0 0]
 #[0 0 0 4 5]], shape=(3, 5), dtype=int32)
#padding mask
#tf.Tensor(
#[[0. 0. 1. 1. 0.]
 #[0. 0. 0. 1. 1.]
# [1. 1. 1. 0. 0.]], shape=(3, 5), dtype=float32)




"""
test
#(3,5)
#padding_mask
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(x)
"""


#sequence mask (look-ahead mask)
#tar_inp: (64, 39)
def create_look_ahead_mask(size):
  #print('tf.ones:',tf.ones((size,size)))
  #print('tf.linalg.band:',tf.linalg.band_part(tf.ones((size, size)), -1, 0))
  #lower-triangular matrix: keep the lower triangle, drop the upper triangle
  #
  #print('create_look_ahead_mask')
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  #print('keep the lower triangle, then invert it: the upper triangle becomes 1 and is masked')
  #print('mask',mask)
  return mask  # (seq_len, seq_len)





"""
x = tf.random.uniform((1, 3))
#print(x,'x.shape:',x.shape[1])
#sequence length
temp = create_look_ahead_mask(x.shape[1])
temp
"""


#tf.Tensor([[0.3118695  0.44500995 0.14591718]], shape=(1, 3), dtype=float32) x.shape: 3
#tf.ones: tf.Tensor(
#[[1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]], shape=(3, 3), dtype=float32)
#tf.linalg.band: tf.Tensor(
#[[1. 0. 0.]
# [1. 1. 0.]
# [1. 1. 1.]], shape=(3, 3), dtype=float32)
#final result: the sequence (look-ahead) mask
#mask tf.Tensor(
#[[0. 1. 1.]
# [0. 0. 1.]
# [0. 0. 0.]], shape=(3, 3), dtype=float32)






#Self-attention module: scaled dot-product attention
#q(64,8,39,64) k(64,8,39,64)  v(64,8,39,64)  mask enc_padding_mask(64, 1, 1, 39)   encoder_layer
#x (64,39,512)  look_ahead_mask(39, 39)   decoder_layer_look_head_mask
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """



  #q,k,v,mask(64,8,39,64)(batch_size, num_heads, seq_len_v, depth)
  #q @ k^T >>> (64,8,39,39) >>> matmul_qk
  #q(64,8,39,64) k(64,8,39,64)  v(64,8,39,64)  mask enc_padding_mask(64, 1, 1, 39) 
  
  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  #matmul_qk  (64,8,39,39)
    

  # scale matmul_qk
  #dk>>k(64,8,39,64).shape[-1]>>64
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  #64
    

  #scale
  #matmul_qk>>(64,8,39,39)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
  #scaled_attention_logits (64,8,39,39)
  

  # add the mask to the scaled tensor.
  #enc_padding_mask(64, 1, 1, 39) 
  #cancel the influence of padding positions
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  
    
  
  #scaled_attention_logits(64,8,39,39)

    
  #softmax layer
  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  #softmax 
  #attention_weights>>> (64,8,39,39)
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
  #attention_weights( (64,8,39,39)*(64,8,39,64)v>>>output>>(64,8,39,64)
    
  #softmax(q @ k^T / sqrt(dk)) @ v
  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
  #scal_dot_attention (64,8,39,64)


  return output, attention_weights








"""
#By default, small floats are printed in scientific notation; the line below turns that off.
np.set_printoptions(suppress=True)

temp_k = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)

temp_v = tf.constant([[   1,0],
                      [  10,0],
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)  # (4, 2)
"""



#Test
# This `query` aligns with the second `key`,
# so the second `value` is returned.
#
#temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)

#q (1,3), k (4,3): q @ k^T -> (1,3)x(3,4) -> (1,4), scale by sqrt(dk), add mask, softmax, then multiply by v (4,2) -> (1,2)
#print_out(temp_q, temp_k, temp_v)
#Self-attention computation walkthrough (trace below)
#q: tf.Tensor([[ 0. 10.  0.]], shape=(1, 3), dtype=float32) 
#k: tf.Tensor(
#[[10.  0.  0.]
#[ 0. 10.  0.]
#[ 0.  0. 10.]
# [ 0.  0. 10.]], shape=(4, 3), dtype=float32)
#v: tf.Tensor(
#[[   1.    0.]
# [  10.    0.]
#[ 100.    5.]
# [1000.    6.]], shape=(4, 2), dtype=float32)
#mask: None

#q @ k^T: (1,3)x(3,4) -> (1,4)
#matmul_qk: tf.Tensor([[  0. 100.   0.   0.]], shape=(1, 4), dtype=float32) 
#k>transpose_b: tf.Tensor(.., shape=(4, 3), dtype=float32)
#dk: tf.Tensor(3.0, shape=(), dtype=float32)

#scaled_attention_logits: tf.Tensor([[ 0.       57.735027  0.        0.      ]], shape=(1, 4), dtype=float32)
#scaled_attention_logits +:> tf.Tensor([[ 0.       57.735027  0.        0.      ]], shape=(1, 4), dtype=float32)
#attention_weights tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)

#a_w*v >>(1,4)*(4,2)>>>(1,2)
#output: tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)
#returns the attention output and the attention weights


"""
#test
# This query aligns with a repeated key (third and fourth), 
# so all associated values get averaged.
temp_q = tf.constant([[0, 0, 10]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

#test
# This query aligns equally with the first and second key, 
# so their values get averaged.
temp_q = tf.constant([[10, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)


#test
temp_q = tf.constant([[0, 0, 10], [0, 10, 0], [10, 10, 0]], dtype=tf.float32)  # (3, 3)
print_out(temp_q, temp_k, temp_v)


"""





#Multi-head attention
class MultiHeadAttention(tf.keras.layers.Layer):
    
  #512  8
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    #512//8==64
    self.depth = d_model // self.num_heads
    #print('self.depth',self.depth)
    #
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    #
    self.dense = tf.keras.layers.Dense(d_model)
        
        
  #(1,60,512),1
  #x(64, 40, 512) (batchsize,seqlength,d_model)  encoder_layer
  #x (64,39,512)  look_ahead_mask(39, 39)   decoder_layer_look_head_mask
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    #(1,60 ,8,64)
    #(64, 40, 8,64) >>> (batch_size, num_heads, seq_len, depth)
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    #transpose
    #(1,8,60,64)
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
    
    
    
    
  #multi-head attention
  #q(64,39,512) k(64,39,512) v(64,39,512)   enc_padding_mask(64, 1, 1, 39)  encoder_layer
  #x (64,39,512)  look_ahead_mask(39, 39)   decoder_layer_look_head_mask
  def call(self, v, k, q, mask):
        

    #64 
    batch_size = tf.shape(q)[0]
    
    
    
    #q(64,39,512) -> q(64,39,512)
    #parameters: 512*512+512  (W_Q)
    #q (64,39,512) >> dense layer >> q (64,39,512)
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    #(64,39,512) -> (64,39,512)
    #k (64,39,512) >> dense layer >> k (64,39,512)  (W_K)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    ##(64,39,512) -> (64,39,512)
    #v (64,39,512) >> dense layer >> v (64,39,512)  (W_V)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    #(batch_size, num_heads, seq_len, depth)  64,8,60,64)
    #split into 8 attention heads
    #q(64, 39, 512)>>(batch_size, num_heads, seq_len_q, depth)(64,8,39,64)
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    #(batch_size, num_heads, seq_len, depth)  1,8,60,64)
    ##k(64, 39, 512)>>(batch_size, num_heads, seq_len_q, depth)(64,8,39,64)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    #(batch_size, num_heads, seq_len, depth)  1,8,60,64)
    ##v(64, 39, 512)>>(batch_size, num_heads, seq_len_q, depth)(64,8,39,64)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    #q(64,8,39,64) k(64,8,39,64)  v(64,8,39,64)  mask enc_padding_mask(64, 1, 1, 39) 
    
    #scaled dot-product attention
    #q(64,8,39,64) k(64,8,39,64)  v(64,8,39,64)  mask enc_padding_mask(64, 1, 1, 39)  encoder_layer
    # x (64,39,512)  look_ahead_mask(39, 39)   decoder_layer_look_head_mask
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    # (batch_size, num_heads, seq_len_q, depth)(64,8,39,64)>>(batch_size, seq_len_q, num_heads, depth)(64,39,8,64)
    ##attention_weights (64, 8, 39, 39)
    #scal_dot_attention (64,8,39,64)
    #scaled_attention: (64, 8, 39, 64)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    ##attention_weights (64, 8, 60, 60)
    #scaled_attention: (64, 8, 60, 64)>>(batch_size, seq_len_q, num_heads, depth)(64,60,8,64)
    # scaled_attention(64,60,8,64)

    #concatenate the 8 heads back together
    #scaled_attention(64, 40,8, 64)>>scaled_attention(64, 40,512)
    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
    #concat_attention(64, 40,512)  (batch_size, seq_len_q, d_model)

    #final dense layer (d_model=512)
    #concat_attention(64, 40,512) 
    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
    #output>>mult_head_attention(64, 40,512)   attention_weights(64, 8, 40, 40)
    
    
    return output, attention_weights




"""
#test
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
#(1,60,512)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)

#attention_weights (1, 8, 60, 60)* v: (1, 8, 60, 64)>>
# output: (1, 8, 60, 64)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape
#computation trace
#

"""




#Point-wise feed-forward network
#(512, 2048)
def point_wise_feed_forward_network(d_model, dff):
    
    
  return tf.keras.Sequential([
      #2048
      #(64, 50, 512)>>(64, 50, 2048)
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      #(64, 50, 2048)>>(64, 50, 512)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])



"""
sample_ffn = point_wise_feed_forward_network(512, 2048)
sample_ffn(tf.random.uniform((64, 50, 512))).shape
"""



"""
Encoder layer
Each encoder layer consists of sublayers:

Multi-head attention (with padding mask)
Point wise feed forward networks.
Each of these sublayers has a residual connection around it followed by a layer normalization. 
Residual connections help in avoiding the vanishing gradient problem in deep networks.

The output of each sublayer is LayerNorm(x + Sublayer(x)). The normalization is done on the d_model (last) axis. 
There are N encoder layers in the transformer.
"""
##x(64,39,512) mask>enc_padding_mask(64, 1, 1, 39) 
class EncoderLayer(tf.keras.layers.Layer):
    
    
  #512  8  2048
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    #multi-head attention
    
    self.mha = MultiHeadAttention(d_model, num_heads)
    #point-wise feed-forward network
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  ##x(64,39,512) mask>enc_padding_mask(64, 1, 1, 39) 
  #(batchsize,input_seq_len,d_model)
  def call(self, x, training, mask):

    #x is the sum of the token embeddings and the positional encodings
    #multi-head self-attention: q(64,39,512) k(64,39,512) v(64,39,512)  enc_padding_mask(64, 1, 1, 39)
    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    #attn_output, _: (64, 39, 512)   _ (64, 8, 39, 39)
    
    #dropout1
    attn_output = self.dropout1(attn_output, training=training)
    #attn_output, _: (64, 39, 512) 
    
    #residual connection + layer normalization
    #x (64,39,512)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    #out1: ((64,39,512)
    
    #point-wise feed-forward network
    #x (64,39,512)>>x (64,39,2048)>(64,39,512)
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    #ffn_output:  (64,39,512)
    
    #dropout2
    ffn_output = self.dropout2(ffn_output, training=training)
    #ffn_output(64,39,512)
    
    #residual + layer normalization
    #out2:(out1 + ffn_output) (64,39,512)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    #out2:(out1 + ffn_output) (64,39,512)
    
    
    
    
    return out2

'''
sample_encoder_layer = EncoderLayer(512, 8, 2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((64, 43, 512)), False, None)

#sample_encoder_layer.summary()

'''


#Shape trace (pseudo output)
"""
scaled_dot_product_attention
EncoderLayer>>>>>>>>>>>>>>>>>
multi_head_attention>>>>>>>>>>>>
q: (64, 8, 40, 64) k: (64, 8, 40, 64) v: (64, 8, 40, 64) mask: None
transpose>>K: (64, 40, 8, 64)
matmul_qk: (64, 8, 40, 40) k>transpose_b: (64, 8, 40, 64)
dk: tf.Tensor(64.0, shape=(), dtype=float32)
scaled_attention_logits: (64, 8, 40, 40)
scaled_attention_logits +:> (64, 8, 40, 40)
attention_weights (64, 8, 40, 40)
output: (64, 8, 40, 64)
#multi_head_attention
add (64, 40, 512)
layer norm
attn_output(64, 40, 512) attention_weights (64, 8, 40, 40)
attn_output, _: (64, 40, 512) (64, 8, 40, 40)
attn_output: (64, 40, 512)
out1: (64, 40, 512)
ffn_output: (64, 40, 512)
ffn_output:dropout2 (64, 40, 512)
out2:(out1 + ffn_output) (64, 40, 512)
"""








"""
Decoder layer
Each decoder layer consists of sublayers:

Masked multi-head attention (with look ahead mask and padding mask)
Multi-head attention (with padding mask). V (value) and K (key) receive the encoder output as inputs. 
Q (query) receives the output from the masked multi-head attention sublayer.
Point wise feed forward networks
Each of these sublayers has a residual connection around it followed by a layer normalization. 
The output of each sublayer is LayerNorm(x + Sublayer(x)). The normalization is done on the d_model (last) axis.

There are N decoder layers in the transformer.

As Q receives the output from decoder's first attention block, and K receives the encoder output, 
the attention weights represent the importance given to the decoder's input based on the encoder's output. 
In other words, the decoder predicts the next word by looking at the encoder output and self-attending to its own output.
See the demonstration above in the scaled dot product attention section.
"""


#(512, 8, 2048)
class DecoderLayer(tf.keras.layers.Layer):
    
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  #x (64,39,512): target-token embeddings plus target positional encodings
  #enc_output: (64, 39, 512) 
  #look_ahead_mask(39, 39)  dec_padding_mask (64, 1, 1, 39)  
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    
    #masked multi-head self-attention with the look_ahead_mask
    #x is the target-sequence representation
    #x (64,39,512)  look_ahead_mask(39, 39)
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    #attn1 (64, 39, 512)  attn_weights_block1 (64, 8, 39,39)
    
    attn1 = self.dropout1(attn1, training=training)
    #attn1 (64, 39, 512)  attn_weights_block1 (64, 8, 39,39)
    
    #residual + layer normalization
    out1 = self.layernorm1(attn1 + x)
    #out1 (64, 39, 512)  attn_weights_block1 (64, 8, 39,39)
    
    
    #multi-head attention over the encoder output
    #v = enc_output (64, 39, 512)
    #k = enc_output (64, 39, 512)
    #q = out1 (64, 39, 512), the output of the first attention block
    #padding_mask=dec_padding_mask (64, 1, 1, 39)
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    
    #attn2: (64, 39, 512) attn_weights_block2 (64, 8, 39, 39)
   

    attn2 = self.dropout2(attn2, training=training)
    #attn2: (64, 39, 512) attn_weights_block2 (64, 8, 39, 39)
    
    #attn2 (64, 39, 512) + out1 (64, 39, 512)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    ##attn2: (64, 39, 512) attn_weights_block2 (64, 8, 39, 39)
    
    
    #point-wise feed-forward network
    #attn2: (64, 39, 512) 
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    #attn2: (64, 39, 512) 
    
    
    ffn_output = self.dropout3(ffn_output, training=training)
    ##attn2: (64, 39, 512) 
    
    #residual + layer normalization
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    #decoder_out: (64, 39, 512)  attn_weights_block1 (64, 8, 39, 39)  attn_weights_block2(64, 8, 39, 39)
    

    return out3, attn_weights_block1, attn_weights_block2


"""
sample_decoder_layer = DecoderLayer(512, 8, 2048)

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 40, 512)), sample_encoder_layer_output, 
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)
"""



#Shape trace (pseudo output)
"""
DecoderLayer>>>>>>>>>>>>>>>>>>>>>
multi_head_attention>>>>>>>>>>>>

q: (64, 8, 40, 64) k: (64, 8, 40, 64) v: (64, 8, 40, 64) mask: None
transpose>>K: (64, 40, 8, 64)
matmul_qk: (64, 8, 40, 40) k>transpose_b: (64, 8, 40, 64)
dk: tf.Tensor(64.0, shape=(), dtype=float32)
scaled_attention_logits: (64, 8, 40, 40)
scaled_attention_logits +:> (64, 8, 40, 40)
attention_weights (64, 8, 40, 40)*v>>
output: (64, 8, 40, 64)


multi_head_attention>>>>>>>>>>>>
pwff
attn1: (64, 40, 512) attn_weights_block1 (64, 8, 40, 40)
attn1>>dropout1 (64, 40, 512)
out1>>layernorm1>>(attn1 + x) (64, 40, 512)
multi_head_attention>>>>>>>>>>>>

q: (64, 8, 40, 64) k: (64, 8, 40, 64) v: (64, 8, 40, 64) mask: None
transpose>>K: (64, 40, 8, 64)
matmul_qk: (64, 8, 40, 40) k>transpose_b: (64, 8, 40, 64)
dk: tf.Tensor(64.0, shape=(), dtype=float32)
scaled_attention_logits: (64, 8, 40, 40)
scaled_attention_logits +:> (64, 8, 40, 40)
attention_weights (64, 8, 40, 40)
output: (64, 8, 40, 64)
attn2: (64, 40, 512) attn_weights_block2 (64, 8, 40, 40)
attn2>>dropout1 (64, 40, 512)
out2>>layernorm1>>(attn2 + out1) (64, 40, 512)
ffn_output>>ffn (64, 40, 512)
ffn_output>>dropout3 (64, 40, 512)
out3>>>layernorm3(ffn_output + out2) (64, 40, 512)
out3: (64, 40, 512) attn_weights_block1: (64, 8, 40, 40) attn_weights_block2: (64, 8, 40, 40)
"""




"""
Encoder
The Encoder consists of: 1. Input Embedding 2. Positional Encoding 3. N encoder layers

The input is put through an embedding which is summed with the positional encoding. 
The output of this summation is the input to the encoder layers. The output of the encoder is the input to the decoder.
##8214+2  8216
#8087+2  8089
"""
class Encoder(tf.keras.layers.Layer):
    
  #num_layers=6  d_model=512  num_heads=8  input_vocab_size=8216 (vocab_size + 2)
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    #x (64, 39) token ids >> embedding >> (64, 39, 512)
    #input_vocab_size8216   d_model512
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    #8214
    self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)
    #pos_encoding(1,8216,512)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  #x>inp : x  (64, 39)  mask>enc_padding_mask(64, 1, 1, 39) 
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    #39
    
    #embedding
    # adding embedding and position encoding.
    #x  (64, 39) 
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    #x(64,39,512)
    
    #scale by sqrt(d_model)
    #(64,39,512)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    #(64,39,512)
    
    #add the positional encodings, sliced to the current sequence length
    #x(64,39,512) + pos_encoding[:, :seq_len, :]  (pos_encoding is (1, 8216, 512))
    x += self.pos_encoding[:, :seq_len, :]
    
    #print('x+self.pos_encoding[:, :seq_len, :]:',x.shape)
    #x(64,39,512) 
    x = self.dropout(x, training=training)
    #print('self.dropout(x:',x.shape)
    
    
    #x(64,39,512) mask>enc_padding_mask(64, 1, 1, 39) 
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    
    
    return x  # (batch_size, input_seq_len, d_model)

'''
#test
sample_encoder = Encoder(num_layers=6, d_model=512, num_heads=8, 
                         dff=2048, input_vocab_size=8214)

sample_encoder_output = sample_encoder(tf.random.uniform((64, 62)), 
                                       training=False, mask=None)

print (sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)
'''

"""
伪代码

"""




"""
Decoder
The Decoder consists of: 1. Output Embedding 2. Positional Encoding 3. N decoder layers

The target is put through an embedding which is summed with the positional encoding. 
The output of this summation is the input to the decoder layers. The output of the decoder is the input to the final linear layer.

"""
class Decoder(tf.keras.layers.Layer):
    
  #6,512,8,2048,8087
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, 
               rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(target_vocab_size, d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  ##tar (64, 39)   enc_output: (64, 39, 512) 
  #look_ahead_mask(39, 39)  dec_padding_mask (64, 1, 1, 39)  
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    
    
    #x holds the target tokens (teacher forcing), not the decoder's own output
    seq_len = tf.shape(x)[1]
    #length of the target sequence
    
    attention_weights = {}
    
    #embedding layer for the target tokens
    #(uses the target-language vocabulary)
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    #(64,39,512)   (batch_size, target_seq_len, d_model)
    
    #
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    #(64,39,512) 
    
    
    #x (64,39,512) + pos_encoding[:, :seq_len, :]  (pos_encoding is (1, 8089, 512))
    x += self.pos_encoding[:, :seq_len, :]
    
    
    #x (64,39,512)
    x = self.dropout(x, training=training)
    

    #x (64,39,512) 目标标签  和目标位置序列信息
    #enc_output: (64, 39, 512) 
    ##look_ahead_mask(39, 39)  dec_padding_mask (64, 1, 1, 39)  
    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
        
        
    
    # x.shape == (batch_size, target_seq_len, d_model)
    # x: (64, target_seq_len=39, 512)
    ##decoder_out: (64, 39, 512)  attention_weights=[attn_weights_block1 (64, 8, 39, 39)  ,attn_weights_block2(64, 8, 39, 39)]
    
    return x, attention_weights


"""
test 
sample_decoder = Decoder(num_layers=6, d_model=512, num_heads=8, 
                         dff=2048, target_vocab_size=8089)

output, attn = sample_decoder(tf.random.uniform((64, 26)), 
                              enc_output=sample_encoder_output, 
                              training=False, look_ahead_mask=None, 
                              padding_mask=None)

output.shape, attn['decoder_layer2_block2'].shape
"""



"""
伪代码

"""





"""
Create the Transformer
Transformer consists of the encoder, decoder and a final linear layer. 
The output of the decoder is the input to the linear layer and its output is returned.
"""
class Transformer(tf.keras.Model):
    
  #
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, rate=0.1):
    super(Transformer, self).__init__()
    

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  #inp: (64, 39)  tar (64, 39)
  #enc_padding_mask(64, 1, 1, 39)   dec_padding_mask (64, 1, 1, 39)  
  #dec_target_padding_mask(64, 1, 1, 39)+look_ahead_mask(39, 39)=combined_mask(64, 1, 39, 39)  
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    
    
    #inp: (64, 39)  enc_padding_mask(64, 1, 1, 39) 
    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    #enc_output: (64, 39, 512) (batch_size, inp_seq_len, d_model)
    
    
    
    #decoder: takes the encoder output enc_output (64,39,512) as its context input
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    #tar (64, 39)   enc_output: (64, 39, 512) 
    #look_ahead_mask(39, 39)  dec_padding_mask (64, 1, 1, 39)  
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    #dec_output: (64, 39, 512)
    ###decoder_out: (64, 39, 512)  attention_weights=[attn_weights_block1 (64, 8, 39, 39)  ,attn_weights_block2(64, 8, 39, 39)]
   

    
    #final linear projection to target-vocabulary logits
    #dec_output: (64, 39, 512)
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    #final_output: (64, 39, 8089)  (batch_size, tar_seq_len, target_vocab_size)
    
    
    return final_output, attention_weights
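

# A quick sanity check of the full model's output shape, in the same spirit as
# the layer tests above (the sample sizes below are illustrative assumptions,
# not values from the original listing):
"""
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048,
    input_vocab_size=8500, target_vocab_size=8000)

temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                               enc_padding_mask=None,
                               look_ahead_mask=None,
                               dec_padding_mask=None)

print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size) -> (64, 36, 8000)
"""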









'''
Set hyperparameters
The tutorial this code follows reduces num_layers, d_model, and dff to keep the example small and relatively fast.
This listing instead keeps the values used in the base model of the transformer: num_layers=6, d_model=512, dff=2048.
See the paper for all the other versions of the transformer.
'''
num_layers = 6
d_model = 512
dff = 2048
num_heads = 8

input_vocab_size = tokenizer_pt.vocab_size + 2
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1



"""
Optimizer
Use the Adam optimizer with a custom learning rate scheduler according to the formula in the paper.


"""

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)


temp_learning_rate_schedule = CustomSchedule(d_model)

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")


"""
Loss and metrics
Since the target sequences are padded, it is important to apply a padding mask when calculating the loss.
"""

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')
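
# Note: train_accuracy above also counts padded (id 0) positions. A masked
# accuracy in the same spirit as loss_function could look like this
# (a sketch, not part of the original code):
def masked_accuracy(real, pred):
  #pred: (batch, seq_len, vocab) logits; real: (batch, seq_len) token ids
  pred_ids = tf.cast(tf.argmax(pred, axis=-1), dtype=real.dtype)
  match = tf.cast(tf.equal(real, pred_ids), tf.float32)
  mask = tf.cast(tf.math.not_equal(real, 0), tf.float32)
  return tf.reduce_sum(match * mask) / tf.reduce_sum(mask)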


transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)




#inp: (64, 39)  tar_inp: (64, 39) (batchsize, seq_len)
def create_masks(inp, tar):
  # Encoder padding mask
  #
  #print('create_padding_mask>>enc_padding_mask')
  #inp: (64, 39)  
  enc_padding_mask = create_padding_mask(inp)
  #enc_padding_mask (64, 1, 1, 39)  (batch_size, 1, 1, seq_len)
  #print('inp>>:',inp.shape,'enc_padding_mask>>',enc_padding_mask.shape)
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  #np: (64, 39)  
  dec_padding_mask = create_padding_mask(inp)
  #dec_padding_mask (64, 1, 1, 39)  (batch_size, 1, 1, seq_len)
  #print('inp>>:',inp.shape,'dec_padding_mask>>',dec_padding_mask.shape)
  

  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.
  #tar_inp: (64, 39)
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  #print('tf.shape(tar)[1]>>:',tf.shape(tar)[1],'look_ahead_mask>>',look_ahead_mask.shape)
  #  look_ahead_mask>> (39, 39)
    
  #tar_inp: (64, 39)
  dec_target_padding_mask = create_padding_mask(tar)
  #print('tar>>:',tar.shape,'dec_target_padding_mask>>',dec_target_padding_mask.shape)
  #dec_target_padding_mask (64, 1, 1, 39)  (batch_size, 1, 1, seq_len)
    
  #dec_target_padding_mask (64, 1, 1, 39)  look_ahead_mask(39, 39)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  #combined_mask: (64, 1, 39, 39)
  #print('the two masks are combined and applied together')
  #print('combined_mask:', combined_mask.shape)
  
  #print('enc_padding_mask:',enc_padding_mask.shape,'combined_mask:',combined_mask.shape,'dec_padding_mask:',dec_padding_mask.shape)
  #enc_padding_mask(64, 1, 1, 39)   dec_padding_mask (64, 1, 1, 39)  
  #dec_target_padding_mask(64, 1, 1, 39)+look_ahead_mask(39, 39)=combined_mask(64, 1, 39, 39)  
  return enc_padding_mask, combined_mask, dec_padding_mask




checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')
    
    
EPOCHS = 1
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.


#Use an input signature to specify more generic shapes.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]



@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    
  #print('inp:',inp.shape,'tar:',tar.shape)
  #inp: (64, 39) tar: (64, 40)
  #decoder input: every target token except the last
  tar_inp = tar[:, :-1]
  #target labels: every target token except the first (shifted right by one)
  tar_real = tar[:, 1:]
  
  #tar_inp: (64, 39) tar_real: (64, 39)
  #print('tar_inp:',tar_inp.shape,'tar_real:',tar_real.shape)
  
  #print('create_masks>>>>start>>>>>>>>>>>>>>>')
  #inp: (64, 39)  tar_inp: (64, 39)
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  #print('create_masks>>>>stop>>>>>>>>>>>>>>>')
  #enc_padding_mask(64, 1, 1, 39)   dec_padding_mask (64, 1, 1, 39)  
  #dec_target_padding_mask(64, 1, 1, 39)+look_ahead_mask(39, 39)=combined_mask(64, 1, 39, 39)  
  
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)




for epoch in range(EPOCHS):
  #one epoch
  #print('epoch====',epoch,'<<<<<start00000000000000000000000000000000000000000000000000000000000000000')
  start = time.time()
  #print('start_time:',start)
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  # inp -> portuguese, tar -> english
  #inp: (64, 39) tar: (64, 40)  inp: (64, 38) tar: (64, 37).........
  for (batch, (inp, tar)) in enumerate(train_dataset.take(1)):
    #train on one batch of 64 examples
    train_step(inp, tar)
    
    if batch % 50 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
    
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
    
    

transformer.summary()
tf.keras.utils.plot_model(transformer,'transformer_info.png',show_shapes=True)





"""
伪代码
"""

#Translation (inference)

def evaluate(inp_sentence):
  start_token = [tokenizer_pt.vocab_size]
  end_token = [tokenizer_pt.vocab_size + 1]
  
  # inp sentence is portuguese, hence adding the start and end token
  inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
  encoder_input = tf.expand_dims(inp_sentence, 0)
  
  # as the target is english, the first word to the transformer should be the
  # english start token.
  decoder_input = [tokenizer_en.vocab_size]
  output = tf.expand_dims(decoder_input, 0)
    
  for i in range(MAX_LENGTH):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)
  
    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)
    
    # select the last word from the seq_len dimension
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    
    # return the result if the predicted_id is equal to the end token
    if predicted_id == tokenizer_en.vocab_size+1:
      return tf.squeeze(output, axis=0), attention_weights
    
    # concatenate the predicted_id to the output which is given to the decoder
    # as its input.
    output = tf.concat([output, predicted_id], axis=-1)

  return tf.squeeze(output, axis=0), attention_weights


def translate(sentence, plot=''):
  result, attention_weights = evaluate(sentence)
  
  predicted_sentence = tokenizer_en.decode([i for i in result 
                                            if i < tokenizer_en.vocab_size])  

  print('Input: {}'.format(sentence))
  print('Predicted translation: {}'.format(predicted_sentence))
  
  if plot:
    plot_attention_weights(attention_weights, sentence, result, plot)
    
    

translate("este é um problema que temos que resolver.")
print ("Real translation: this is a problem we have to solve .")

 


Reprinted from: https://blog.csdn.net/qq_29678299/article/details/101305597