from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset, concatenate_datasets
from fastai.text.all import DataBlock, IndexSplitter, noop, perplexity
from fasthugs.learner import TransLearner
from fasthugs.data import TransformersLMBlock
model_name = 'distilroberta-base'
# data
max_length = 128
bs = 16
val_bs = bs*4
# training
lr = 3e-5
Data preprocessing
In this example notebook we use HuggingFace datasets for preprocessing (as shown in the example notebook here).
ds_name = 'imdb'
dataset = load_dataset(ds_name)
dataset = dataset['train'].select(range(2000))
dataset.column_names
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize(batch):
    # special_tokens_mask lets the MLM collator skip masking the <s>/</s> tokens later
    return tokenizer(batch['text'], return_attention_mask=True, return_special_tokens_mask=True, verbose=False)
dataset.info.task_templates = []  # clear the dataset's built-in task template so remove_columns below works cleanly
dataset = dataset.map(tokenize, batched=True, batch_size=100, remove_columns=dataset.column_names, num_proc=4)
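After mapping, each example holds only the tokenizer outputs (input_ids, attention_mask, special_tokens_mask); the original text and label columns are removed. As a quick, purely illustrative check of what the extra flags give us (the sentence below is made up, not taken from the dataset):
sample = tokenizer("The movie was great!", return_attention_mask=True, return_special_tokens_mask=True)
print(list(sample.keys()))                                   # ['input_ids', 'attention_mask', 'special_tokens_mask']
print(tokenizer.convert_ids_to_tokens(sample['input_ids']))  # <s> ... </s> wrapping the word pieces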
block_size = max_length
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead of dropping if the model
    # supported it. You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # These labels are just a copy of input_ids; the masked-LM collator will
    # overwrite them with the masking targets.
    result["labels"] = result["input_ids"].copy()
    return result
lm_dataset = dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)
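To make the chunking concrete, here is a tiny hand-worked example of the same logic, using a made-up chunk size of 4 instead of the real block_size of 128:
toy = {'input_ids': [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}
chunk = 4                                        # stand-in for block_size
flat = sum(toy['input_ids'], [])                 # [1, 2, 3, 4, 5, 6, 7, 8, 9]
usable = (len(flat) // chunk) * chunk            # 8 -> the trailing 9 is dropped
print([flat[i:i + chunk] for i in range(0, usable, chunk)])  # [[1, 2, 3, 4], [5, 6, 7, 8]]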
import random
# hold out a random 10% of the grouped chunks for validation
N = len(lm_dataset)
idx = list(range(N))
random.shuffle(idx)
split = int(N*0.9)
train_idx = idx[:split]
valid_idx = idx[split:]
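The same 90/10 split could also be expressed with fastai's RandomSplitter instead of shuffling indices by hand; this is shown only as an alternative, the notebook keeps IndexSplitter below:
from fastai.data.transforms import RandomSplitter
alt_splitter = RandomSplitter(valid_pct=0.1, seed=42)  # would take the place of IndexSplitter(valid_idx)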
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=IndexSplitter(valid_idx))
dls = dblock.dataloaders(lm_dataset, bs=bs, val_bs=val_bs, num_workers=4)
dls.show_batch()
b = dls.one_batch()
b[0]['input_ids'], b[0]['labels']
The labels are constructed by DataCollatorForLanguageModeling, and the loss computed by the model is used for training, which is why loss_func=noop is passed to the learner below.
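By default the collator masks roughly 15% of the tokens (its mlm_probability), mostly replacing them with the <mask> token; labels keep the original ids at those positions and are -100 everywhere else, so only the masked tokens contribute to the loss. A quick, illustrative peek at the batch fetched above:
print(tokenizer.decode(b[0]['input_ids'][0]))     # the text with <mask> tokens sprinkled in
print((b[0]['labels'][0] != -100).sum())          # number of positions that contribute to the loss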
model = AutoModelForMaskedLM.from_pretrained(model_name)
learn = TransLearner(dls, model, loss_func=noop, metrics=perplexity).to_fp16()
As masking is done randomly on the fly, the validation score may vary from run to run.
learn.validate()
learn.fit_flat_cos(2, lr)
learn.validate()
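If you need the two validation numbers to be comparable run-to-run, one option (a sketch, not something this notebook does) is to fix the random seeds before validating so the masking pattern is largely repeatable:
from fastai.torch_core import set_seed
set_seed(42, reproducible=True)  # seeds python, numpy and torch RNGs
learn.validate()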