Keras Tutorial: Text Extraction with BERT
- August 26, 2020
- AI
Bilingual version by the subtitle group: Keras Tutorial: Text Extraction with BERT
English original: BERT (from HuggingFace Transformers) for Text Extraction
Introduction
This demonstration uses SQuAD (Stanford Question-Answering Dataset). In SQuAD, an input consists of a question and a context paragraph. The goal is to find the span of the paragraph that answers the question. We evaluate our performance on this data with the Exact Match metric, which measures the percentage of predictions that exactly match any one of the ground-truth answers.
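For reference, each record in the SQuAD v1.1 JSON follows the nesting sketched below. The field names match the real dataset, but the context, question and answer here are invented purely for illustration; this is the structure the preprocessing code later walks through.

# Hypothetical SQuAD-style record (field names are real, the text is made up).
example_record = {
    "data": [
        {
            "title": "Example_Article",
            "paragraphs": [
                {
                    "context": "Keras was released in March 2015.",
                    "qas": [
                        {
                            "id": "0001",
                            "question": "When was Keras released?",
                            "answers": [
                                # answer_start is a character offset into context
                                {"text": "March 2015", "answer_start": 22},
                            ],
                        }
                    ],
                }
            ],
        }
    ]
}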
We fine-tune a BERT model as follows:
- Feed the context and the question as inputs to BERT.
- Take two vectors S and T with dimensions equal to that of the hidden states in BERT.
- Compute the probability of each token being the start and the end of the answer span. The probability of a token being the start of the answer is given by a dot product between S and the representation of the token in the last layer of BERT, followed by a softmax over all tokens. The probability of a token being the end of the answer is computed similarly with the vector T (see the sketch after this list).
- Fine-tune BERT and learn S and T along the way.
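The following toy snippet is not part of the tutorial; all names are random stand-ins. It only illustrates the scoring rule described in the list above: the start and end probabilities are dot products of each token's final hidden state with S and T, followed by a softmax over the sequence.

import numpy as np

# Toy illustration of the start/end scoring; hidden_states, S and T are
# random placeholders for BERT's last-layer outputs and the two learned vectors.
seq_len, hidden_dim = 384, 768
hidden_states = np.random.randn(seq_len, hidden_dim)  # one row per token
S = np.random.randn(hidden_dim)  # start vector
T = np.random.randn(hidden_dim)  # end vector

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

start_probs = softmax(hidden_states @ S)  # probability of each token starting the answer
end_probs = softmax(hidden_states @ T)    # probability of each token ending the answer

In the model defined later, the two Dense(1, use_bias=False) heads followed by a softmax activation play exactly the role of S and T here.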
Setup

import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT
Set up the BERT tokenizer

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
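As a quick sanity check (not part of the original tutorial), you can inspect what the fast tokenizer returns on a throwaway sentence; the character offsets are what the preprocessing code below relies on to map answer spans to tokens.

# Illustrative only: inspect the fast tokenizer's output.
enc = tokenizer.encode("BERT is great at span extraction.")
print(enc.tokens)   # word pieces, including [CLS] and [SEP]
print(enc.ids)      # vocabulary ids that will be fed to the model
print(enc.offsets)  # (start, end) character offsets back into the input string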
Load the data

train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)
Preprocess the data
- Go through the JSON file and store every record as a SquadExample object.
- Go through each SquadExample object to create x_train, y_train, x_eval, y_eval.
class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of answer in context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets


with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)


def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip == False:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y


train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")
87599 training points created.
10570 evaluation points created.
Create the Question-Answering model using BERT and the Functional API

def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model
This code should preferably be run on a Google Colab TPU runtime. With Colab TPUs, each epoch takes about 5-6 minutes.

use_tpu = True
if use_tpu:
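    # The remainder of this block is truncated in the source. The lines below
    # are an assumed completion following the standard TF 2.x Colab TPU setup,
    # not the tutorial's verbatim code.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Build the model inside the strategy scope so its variables live on the TPU.
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()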
Create evaluation callback
This callback computes the exact match score on the validation data after every epoch.
def normalize_text(text):
    text = text.lower()

    # Remove punctuations
    exclude = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in exclude)

    # Remove articles
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    text = re.sub(regex, " ", text)

    # Remove extra white space
    text = " ".join(text.split())
    return text


class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")
Train and Evaluate

exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=1,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
    callbacks=[exact_match_callback],
)
epoch=1, exact match score=0.78
1346/1346 - 350s - activation_7_loss: 1.3488 - loss: 2.5905 - activation_8_loss: 1.2417
<tensorflow.python.keras.callbacks.History at 0x7fc78b4458d0>