命名实体识别之bert+bilstm(基于tensorflow)
接下来我们继续对官方基于 BERT 的模型进行扩展:在 BERT 的序列输出之上接入一层 BiLSTM。之前的文章可参考:
命名实体识别之使用tensorflow的bert模型进行微调
命名实体识别之动态融合不同bert层的特征(基于tensorflow)
直接看代码:
class MyModel:
    """BERT + BiLSTM token-level classifier (NER) built with TF1 graph APIs.

    Creating an instance declares the input placeholders and immediately
    builds the whole graph (BERT encoder -> dropout -> BiLSTM -> projection
    -> softmax cross-entropy loss) by calling ``bert_embed``.
    """

    def __init__(self, config):
        """Declare placeholders, cache hyper-parameters and build the graph.

        :param config: object exposing ``lstm_dim``, ``relation_num``,
            ``bert_config_file`` and ``bert_file``.
        """
        self.config = config

        # Placeholders for the data fed into the model.
        self.input_x_word = tf.placeholder(tf.int32, [None, None], name="input_x_word")
        self.input_x_len = tf.placeholder(tf.int32, name='input_x_len')
        self.input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask')
        # Gold NER label ids, one per token.
        self.input_relation = tf.placeholder(tf.int32, [None, None], name='input_relation')
        # Keep probability for the dropout applied on top of BERT
        # (feed 1.0 when evaluating / predicting).
        self.keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        self.is_training = tf.placeholder(tf.bool, None, name='is_training')

        self.initializer = initializers.xavier_initializer()
        self.lstm_dim = self.config.lstm_dim
        self.relation_num = self.config.relation_num
        # Dynamic sequence length of the current batch (a scalar tensor).
        self.num_steps = tf.shape(self.input_x_word)[-1]
        print("self.num_steps.shape[-1]:", tf.shape(self.input_x_word)[-1])

        self.bert_embed(bert_init=True)

    def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, name=None):
        """Run a bidirectional LSTM over the token representations.

        :param lstm_inputs: [batch_size, num_steps, emb_size]
        :param lstm_dim: number of units of each directional cell
        :param lengths: per-example lengths, passed as ``sequence_length``
        :param name: optional name scope; defaults to ``"char_BiLSTM"``
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        with tf.name_scope("char_BiLSTM" if not name else name):
            lstm_cell = {}
            for direction in ["forward", "backward"]:
                with tf.name_scope(direction):
                    lstm_cell[direction] = rnn.CoupledInputForgetGateLSTMCell(
                        lstm_dim,
                        use_peepholes=True,
                        initializer=self.initializer,
                        state_is_tuple=True)
            # Only the per-step outputs are needed; final states are dropped.
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_cell["forward"],
                lstm_cell["backward"],
                lstm_inputs,
                dtype=tf.float32,
                sequence_length=lengths)
        # Concatenate forward and backward outputs along the feature axis.
        return tf.concat(outputs, axis=2)

    def project_layer(self, lstm_outputs, name=None):
        """Hidden layer between the LSTM layer and the tag logits.

        :param lstm_outputs: [batch_size, num_steps, 2*lstm_dim]
        :param name: optional name scope; defaults to ``"project"``
        :return: [batch_size, num_steps, num_tags]
        """
        # NOTE(review): tf.get_variable ignores tf.name_scope, so the
        # variables below are not prefixed with "project/hidden" or
        # "project/logits". Switching to tf.variable_scope would rename them
        # and break checkpoints saved with the current names, so the scopes
        # are left unchanged.
        with tf.name_scope("project" if not name else name):
            with tf.name_scope("hidden"):
                W = tf.get_variable("HW", shape=[self.lstm_dim * 2, self.lstm_dim],
                                    dtype=tf.float32, initializer=self.initializer)
                b = tf.get_variable("Hb", shape=[self.lstm_dim], dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                # Flatten batch and time so one matmul covers every token.
                output = tf.reshape(lstm_outputs, shape=[-1, self.lstm_dim * 2])
                hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))

            # Project to the score of each tag.
            with tf.name_scope("logits"):
                W = tf.get_variable("LW", shape=[self.lstm_dim, self.relation_num],
                                    dtype=tf.float32, initializer=self.initializer)
                b = tf.get_variable("Lb", shape=[self.relation_num], dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, W, b)

            # Restore the [batch, time, tags] layout.
            return tf.reshape(pred, [-1, self.num_steps, self.relation_num],
                              name='pred_logits')

    def loss_without_crf(self, output_layer, num_labels, bert_init=True):
        """Attach a softmax cross-entropy loss and optionally load BERT weights.

        Sets ``self.logits``, ``self.probabilities``, ``self.predictions``,
        ``self.per_example_loss`` and ``self.loss``.

        :param output_layer: token logits, [batch_size, num_steps, num_labels]
        :param num_labels: depth used to one-hot encode ``self.input_relation``
        :param bert_init: when True, initialise the matching variables from
            the checkpoint at ``self.config.bert_file``
        """
        with tf.variable_scope("loss"):
            self.logits = output_layer
            self.probabilities = tf.nn.softmax(self.logits, axis=-1)
            log_probs = tf.nn.log_softmax(self.logits, axis=-1)  # [?, ?, num_labels]
            print("log_probs.shape:", log_probs.shape)
            self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")
            one_hot_labels = tf.one_hot(self.input_relation, depth=num_labels,
                                        dtype=tf.float32)  # [?, ?, num_labels]
            # Cross-entropy of every token position: [?, ?].
            self.per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            # NOTE(review): the mean is taken over every position, padding
            # included (self.input_mask is not applied) - confirm intended.
            self.loss = tf.reduce_mean(self.per_example_loss)
            print(self.loss)

        tvars = tf.trainable_variables()
        init_checkpoint = self.config.bert_file
        assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
            tvars, init_checkpoint)
        if bert_init:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            print(" name = {}, shape = {}{}".format(var.name, var.shape, init_string))
        print('init bert from checkpoint: {}'.format(init_checkpoint))

    def bert_embed(self, bert_init=True):
        """Build the BERT encoder and stack the BiLSTM, projection and loss on it.

        :param bert_init: forwarded to ``loss_without_crf``; when True the
            BERT variables are initialised from the pre-trained checkpoint
        :return: None
        """
        num_labels = self.config.relation_num
        bert_config_file = self.config.bert_config_file
        bert_config = BertConfig.from_json_file(bert_config_file)

        model = BertModel(
            config=bert_config,
            is_training=self.is_training,  # fine-tuning
            input_ids=self.input_x_word,
            input_mask=self.input_mask,
            token_type_ids=None,
            use_one_hot_embeddings=False)

        # Sequence lengths: count the non-zero token ids in each row
        # (presumably id 0 is the padding token - verify against the vocab).
        used = tf.sign(tf.abs(self.input_x_word))
        length = tf.reduce_sum(used, axis=1)
        self.lengths = tf.cast(length, tf.int32)

        # Token-level output of BERT (not the pooled [CLS] vector).
        output_layer = model.get_sequence_output()
        # Use the fed keep probability instead of a hard-coded 0.9 so that
        # dropout can be switched off (keep_prob = 1.0) at inference time.
        lstm_inputs = tf.nn.dropout(output_layer, self.keep_prob)
        output_layer = self.biLSTM_layer(lstm_inputs, self.lstm_dim, self.lengths)
        output_layer = self.project_layer(output_layer)
        print("output_layer.shape:", output_layer.shape)

        self.loss_without_crf(output_layer, num_labels, bert_init=bert_init)

        # NOTE(review): debug halt - the process exits right after the graph
        # is built so that only the shapes / variable list are printed.
        # Remove these two lines before using the model for training.
        import sys
        sys.exit(0)
结果:
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/optimization.py:155: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead. WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead. {'锞', '蚨', '螭', '荑', 'Q', '芘', '呭', '铧', '㈠', '靊', '铖', '氺', '狻', '涜', '鍙', '暍', '閮', '椴', '茀', '锎', '莳', 'Z', 'U', '幤', 'X', '瀵', '’', 'F', 'L', '魑', '鸃', '汭', '嗾', '浐', '谡', '媖', '殚', '曁', '佺', '讣', '戋', 'I', '聃', '骟', '劼', '缁', '嶶', '锛', 'T', '椱', '寳', '叒', '燚', '歃', 'G', '骝', 'O', 'V', '笪', '楒', '赟', '`', '辔', '姇', '狴', '愠', '鹆', 'W', 'S', 'N', '茕', '莜', '叕', '鑱', 'K', '犴', '菥', '塱', '捭', 'Y', 'R', '毖', '閞', '郯', '咑', '鎸', '瘜', '劖', '嗮', 'D', '猊', '囵', '旂', '忔', '亓', '”', '邴', 'E', '龉', '檩', 'B', 'J', '鸱', '狲', '犇', '谝', '茆', '旳', '噃', '钅', '祹', '渼', '魉', '瘕', '鐢', '綉', '\ue40a', '瑧', '槊', '翀', '跬', '屃', '疄', '犰', '勮', 'C', '梿', '鐜', '撺', '跶', '釆', '嚚', '铱', '镚', '戝', '罝', 'P', '亻', '“', '祼', '褴', '睚', '貹', '铗', '庒', '鍒', '姤', '圪', '浡', '帀', '綯', '龃', '讠', '‘', '猢', '睍', '斲', '屼', 'M', 'A', 'H', '诓', '簰', '雬', '俰', '玎'} 8012 {'桭', 'C', 'T', '歀', 'Q', 'D', 'G', '靊', '烎', 'P', '”', '锎', '“', 'E', 'O', 'V', '緈', 'Z', 'J', 'B', 'U', 'X', '’', 'F', 'L', '‘', 'W', '尓', 'N', 'S', 'K', '浐', '諨', 'A', 'H', 'Y', 'M', 'R', 'I'} 1105 WARNING:tensorflow:From test_bert.py:388: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead. 
2020-12-13 14:07:47.209770: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 2020-12-13 14:07:47.265991: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:47.266613: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59 pciBusID: 0000:00:04.0 2020-12-13 14:07:47.266923: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1 2020-12-13 14:07:47.493085: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10 2020-12-13 14:07:47.621614: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10 2020-12-13 14:07:47.641392: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10 2020-12-13 14:07:47.925153: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10 2020-12-13 14:07:47.943921: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10 2020-12-13 14:07:48.468415: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7 2020-12-13 14:07:48.468625: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:48.469411: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning 
NUMA node zero 2020-12-13 14:07:48.470004: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0 2020-12-13 14:07:48.525931: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2200000000 Hz 2020-12-13 14:07:48.526210: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x270ef40 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2020-12-13 14:07:48.526244: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2020-12-13 14:07:48.677879: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:48.678754: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x270f100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2020-12-13 14:07:48.678790: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla T4, Compute Capability 7.5 2020-12-13 14:07:48.679588: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:48.680198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59 pciBusID: 0000:00:04.0 2020-12-13 14:07:48.680265: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1 2020-12-13 14:07:48.680295: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10 2020-12-13 14:07:48.680319: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10 2020-12-13 14:07:48.680346: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10 2020-12-13 14:07:48.680371: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10 2020-12-13 14:07:48.680393: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10 2020-12-13 14:07:48.680416: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7 2020-12-13 14:07:48.680497: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:48.681158: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:48.681699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0 2020-12-13 14:07:48.684658: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1 2020-12-13 14:07:48.686073: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1180] Device interconnect StreamExecutor with strength 1 edge matrix: 2020-12-13 14:07:48.686103: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1186] 0 2020-12-13 14:07:48.686114: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 0: N 2020-12-13 14:07:48.687110: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-12-13 14:07:48.687768: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning 
NUMA node zero 2020-12-13 14:07:48.688359: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0. 2020-12-13 14:07:48.688404: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14221 MB memory) -> physical GPU (device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5) WARNING:tensorflow:From test_bert.py:176: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead. self.num_steps.shape[-1]: Tensor("strided_slice_1:0", shape=(), dtype=int32) WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:175: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead. WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:416: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead. WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:497: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead. WARNING:tensorflow: The TensorFlow contrib module will not be included in TensorFlow 2.0. For more information, please see: * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md * https://github.com/tensorflow/addons * https://github.com/tensorflow/io (for I/O related ops) If you depend on functionality not listed there, please file an issue. WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:364: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version. 
Instructions for updating: Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`. WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:874: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version. Instructions for updating: Use keras.layers.Dense instead. WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version. Instructions for updating: Please use `layer.__call__` method instead. WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:282: The name tf.erf is deprecated. Please use tf.math.erf instead. WARNING:tensorflow:From test_bert.py:209: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version. Instructions for updating: Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/rnn.py:464: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version. Instructions for updating: Please use `keras.layers.RNN(cell)`, which is equivalent to this API WARNING:tensorflow:Entity <bound method CoupledInputForgetGateLSTMCell.call of <tf_utils.rnncell.CoupledInputForgetGateLSTMCell object at 0x7f297e2d2eb8>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. 
Cause: module 'gast' has no attribute 'Num' WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/rnn.py:244: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.where in 2.0, which has the same broadcast rule as np.where WARNING:tensorflow:Entity <bound method CoupledInputForgetGateLSTMCell.call of <tf_utils.rnncell.CoupledInputForgetGateLSTMCell object at 0x7f297e2d2fd0>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: module 'gast' has no attribute 'Num' WARNING:tensorflow:From test_bert.py:225: The name tf.nn.xw_plus_b is deprecated. Please use tf.compat.v1.nn.xw_plus_b instead. output_layer.shape: (?, ?, 11) log_probs.shape: (?, ?, 11) self.per_example_loss.shape: (?, ?) self.loss.shape: () WARNING:tensorflow:From test_bert.py:255: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead. WARNING:tensorflow:From test_bert.py:259: The name tf.train.init_from_checkpoint is deprecated. Please use tf.compat.v1.train.init_from_checkpoint instead. WARNING:tensorflow:From test_bert.py:260: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead. 
name = bert/embeddings/word_embeddings:0, shape = (21128, 768), *INIT_FROM_CKPT* name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT* name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT* name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = 
bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), 
*INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = 
bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = 
bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), 
*INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_7/output/dense/bias:0, 
shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = 
bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape 
= (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT* name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT* name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = 
bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT* name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT* name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT* name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT* name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xi:0, shape = (768, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_hi:0, shape = (256, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_ci:0, shape = (256, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xo:0, shape = (768, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_ho:0, shape = (256, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_co:0, shape = (256, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xc:0, shape = (768, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_hc:0, shape = (256, 256) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_i:0, shape = (256,) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_c:0, shape = (256,) name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_o:0, shape = (256,) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xi:0, shape = (768, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_hi:0, shape = (256, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_ci:0, shape = (256, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xo:0, shape = (768, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_ho:0, shape = (256, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_co:0, shape = (256, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xc:0, shape = (768, 256) name = 
bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_hc:0, shape = (256, 256) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_i:0, shape = (256,) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_c:0, shape = (256,) name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_o:0, shape = (256,) name = HW:0, shape = (512, 256) name = Hb:0, shape = (256,) name = LW:0, shape = (256, 11) name = Lb:0, shape = (11,) init bert from checkpoint: /content/drive/MyDrive/Deep-Learning-With-Python/chapter8/CCF_ner/bert_pretrained/bert_model.ckpt WARNING:tensorflow:From test_bert.py:392: The name tf.train.exponential_decay is deprecated. Please use tf.compat.v1.train.exponential_decay instead. bert train variable num: 199 normal train variable num: 26 word2vec trainable!!
说明:
我们可以直接调用官方的tensorflow版bert模型来使用bert。接下来,我们使用output_layer = model.get_sequence_output()来获得最后一层的特征,然后再添加bilstm层,
对于bilstm的前向和反向的输出进行拼接后,经过一个project_layer()函数计算logits,最后再经过一个损失层计算损失和其它的一些预测的值等。同时我们要将预训练bert模型的参数导入到bert中。
这里面我们可以通过这种方式计算每个序列的长度:
# Derive each sequence's true length from the token-id matrix.
# sign(abs(id)) is 1 for every non-zero id and 0 for id == 0, so summing
# along axis 1 counts the non-zero ids in each row.
# NOTE(review): this assumes padding positions carry id 0 - confirm against
# the data pipeline.
used = tf.sign(tf.abs(self.input_x_word))
# `axis=1` is equivalent to the legacy keyword `reduction_indices=1`.
length = tf.reduce_sum(used, axis=1)
self.lengths = tf.cast(length, tf.int32)
当然,在喂入数据的时候,我们也已经通过input_x_len传入了每个序列的长度,可以酌情选用其中一种方式。
在bert后面接上bilstm之后,一般而言bert微调的学习率和bilstm的学习率是要设置成不同的,比如以下代码:
# Hyper-parameter setup: two optimizers, one for the layers stacked on top of
# BERT and one for fine-tuning BERT itself, each with its own learning rate.

# Step counter advanced by the optimizer of the downstream (non-BERT) layers.
global_step = tf.Variable(0, name='step', trainable=False)

# Learning rate of the downstream layers: staircase exponential decay.
learning_rate = tf.train.exponential_decay(
    learning_rate=config.learning_rate,
    global_step=global_step,
    decay_steps=config.decay_step,
    decay_rate=config.decay_rate,
    staircase=True,
)
normal_optimizer = tf.train.AdamOptimizer(learning_rate)

# Split the trainable variables by name: anything containing 'bert' belongs
# to the BERT encoder, everything else to the downstream structure.
all_variables = graph.get_collection('trainable_variables')
word2vec_var_list = [var for var in all_variables if 'bert' in var.name]
normal_var_list = [var for var in all_variables if 'bert' not in var.name]
print('bert train variable num: {}'.format(len(word2vec_var_list)))
print('normal train variable num: {}'.format(len(normal_var_list)))

# Update op for the downstream variables only.
normal_op = normal_optimizer.minimize(
    model.loss,
    global_step=global_step,
    var_list=normal_var_list,
)

# Total number of training steps: records / batch size * epochs.
num_batch = int(train_iter.num_records / config.batch_size * config.train_epoch)

# Fallback step counter, used as-is only when there are no BERT variables;
# otherwise it is rebound below to the value returned by create_optimizer.
# It requests the same name ('step') as global_step.
embed_step = tf.Variable(0, name='step', trainable=False)

# Default: train the downstream layers only.
train_op = normal_op
if word2vec_var_list:
    # Fine-tune BERT with its own learning rate; the first 5% of the steps
    # are passed as warm-up steps.
    print('word2vec trainable!!')
    word2vec_op, embed_learning_rate, embed_step = create_optimizer(
        model.loss,
        config.embed_learning_rate,
        num_train_steps=num_batch,
        num_warmup_steps=int(num_batch * 0.05),
        use_tpu=False,
        variable_list=word2vec_var_list,
    )
    # Run the BERT update and the downstream update together in one op.
    train_op = tf.group(normal_op, word2vec_op)
一般bert+bilstm之后还需要接一个crf(条件随机场),我们下节继续。