mlm数据处理实现解读

2021-06-23

之前在看 masked language model（MLM）时，对数据处理这一步一直没有从本质上理解清楚：比如先选出约 15% 的 token 作为预测目标，然后对被选中的 token，80% 替换为 [MASK]，10% 保持不变，10% 替换为词表中的随机词。所以这里把 MLM 数据处理部分的代码列出来，方便有需要的读者了解 BERT 是怎么实现的。

另外，关于 MLM 的切分粒度是 wordpiece、整词（whole word）还是 n-gram，这里不做探讨；本质上只是数据处理的方式不同，对训练过程以及下游使用预训练模型来讲并没有影响。

import collections
import random


# Record of a single prediction target: the position that was selected and the
# token originally found at that position.
MaskedLmInstance = collections.namedtuple(
    "MaskedLmInstance", ("index", "label")
)


def _pick_replacement(original_token, vocab_words, rng):
    """Decide what is written at a selected position.

    A first draw below 0.8 yields "[MASK]". Otherwise a second draw below 0.5
    keeps the original token, and anything else takes the entry of
    ``vocab_words`` at an index supplied by ``rng.randint``.
    """
    if rng.random() < 0.8:
        return "[MASK]"
    if rng.random() < 0.5:
        return original_token
    return vocab_words[rng.randint(0, len(vocab_words) - 1)]


def create_masked_lm_predictions(
        tokens,
        masked_lm_prob,
        max_predictions_per_seq,
        vocab_words,
        rng
):
    """
    Build the inputs and targets for the masked LM objective.

    :param tokens: input token sequence; "[CLS]" and "[SEP]" entries are never
        selected as prediction targets
    :param masked_lm_prob: fraction of ``len(tokens)`` to select for prediction
    :param max_predictions_per_seq: upper bound on the number of selected positions
    :param vocab_words: list of vocabulary words used for random replacement
    :param rng: random number generator providing ``shuffle``, ``random`` and
        ``randint`` (for example a ``random.Random`` instance)
    :return: a tuple ``(output_tokens, masked_lm_positions, masked_lm_labels)``:
        a copy of ``tokens`` with the selected positions rewritten, the selected
        positions in ascending order, and the original tokens at those positions
    """
    # Every position except the special tokens is eligible; shuffling the
    # eligible positions makes the selection below a random sample.
    special_tokens = ("[CLS]", "[SEP]")
    candidates = [pos for pos, tok in enumerate(tokens) if tok not in special_tokens]
    rng.shuffle(candidates)

    # Target count: the requested share of the full sequence length (at least
    # one), capped by max_predictions_per_seq.
    proportional_count = max(1, int(round(len(tokens) * masked_lm_prob)))
    num_to_predict = min(max_predictions_per_seq, proportional_count)

    # Work on a copy so the caller's token list is left untouched.
    output_tokens = list(tokens)

    # Candidate positions are unique, so each one is visited at most once.
    selected = []
    for pos in candidates:
        if len(selected) >= num_to_predict:
            break
        original_token = tokens[pos]
        output_tokens[pos] = _pick_replacement(original_token, vocab_words, rng)
        selected.append(MaskedLmInstance(index=pos, label=original_token))

    # Report the targets in sequence order rather than in shuffled order.
    selected.sort(key=lambda instance: instance.index)
    masked_lm_positions = [instance.index for instance in selected]
    masked_lm_labels = [instance.label for instance in selected]

    return output_tokens, masked_lm_positions, masked_lm_labels


if __name__ == '__main__':
    # Small demo: run the masking routine on a toy sentence with a toy
    # vocabulary and an unseeded generator, so each run gives a different result.
    test_input = ['i', 'love', 'you', 'do', 'you', 'like', 'me']
    test_vocab = ['i', 'love', 'you', 'do', 'like', 'me']

    out1, out2, out3 = create_masked_lm_predictions(
        tokens=test_input,
        masked_lm_prob=0.5,
        max_predictions_per_seq=512,
        vocab_words=test_vocab,
        rng=random.Random()
    )

    # Print the results; without this the demo computes them and exits silently.
    print("input tokens:       ", test_input)
    print("output tokens:      ", out1)
    print("masked_lm_positions:", out2)
    print("masked_lm_labels:   ", out3)