from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fastai.text.all import *
# TransformersTextBlock, TextGetter, TransLearner, RougeScore and GeneratePreds
# come from the transformers/fastai integration library used in this notebook
model_name = "t5-small"
max_len = 512
bs = 16
val_bs = bs*2
lr = 2e-5
# here the smaller validation split is used as our training data;
# to train on the full dataset instead, uncomment:
# datasets = load_dataset("xsum")
# train_ds = concatenate_datasets([datasets['train'], datasets['validation']])
train_ds = load_dataset("xsum", split='validation')
tokenizer = AutoTokenizer.from_pretrained(model_name)
splits = RandomSplitter()(train_ds)
@ItemTransform
def untuple1(x):
    # unwrap the single-element tuple produced by the one-block DataBlock
    return (*x[0],)
dblock = DataBlock(
    blocks=[TransformersTextBlock(tokenizer=tokenizer, do_targets=True, with_labels=True)],
    get_x=TextGetter('document', 'summary', prefix1='summarize: '),
    item_tfms=untuple1,
    splitter=IndexSplitter(splits[1]))
%%time
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs, shuffle=True)
dls.show_batch(max_n=4)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# the DataBlock passes labels to the model, which returns its own loss,
# so the fastai loss function is a no-op here
learn = TransLearner(dls, model, loss_func=noop)
learn.add_cb(RougeScore(tokenizer));
learn.validate()
learn.fit_one_cycle(2, 1e-4)
%%time
res = learn.validate()
display_validation_results(res)
So far we have computed predictions with a single forward pass (teacher forcing), so the token generated at timestep $t$ has access to the reference tokens $0:t-1$. At inference time, however, generation is autoregressive: previously generated tokens are used to produce the next one. Let's evaluate the model with this more realistic procedure, which can be done by adding a `GeneratePreds` callback:
%%time
res = learn.validate(cbs=GeneratePreds())
display_validation_results(res)
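To make the difference concrete, here is a minimal sketch of what greedy autoregressive decoding does under the hood, using the `model` and `tokenizer` loaded above. This is illustration only: in practice `model.generate` implements the same loop with key/value caching, beam search and sampling options.
import torch

@torch.no_grad()
def greedy_decode(text, max_new_tokens=60):
    # encode the source document once; it conditions every decoding step
    enc = tokenizer('summarize: ' + text, return_tensors='pt', truncation=True).to(model.device)
    # T5 starts decoding from its configured start token (the pad token)
    dec_ids = torch.full((1, 1), model.config.decoder_start_token_id,
                         dtype=torch.long, device=model.device)
    for _ in range(max_new_tokens):
        logits = model(**enc, decoder_input_ids=dec_ids).logits
        next_id = logits[:, -1].argmax(-1, keepdim=True)  # greedy: pick the most likely token
        dec_ids = torch.cat([dec_ids, next_id], dim=-1)   # feed it back as decoder input
        if next_id.item() == model.config.eos_token_id:   # stop at end-of-sequence
            break
    return tokenizer.decode(dec_ids[0], skip_special_tokens=True)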
sample = train_ds[0]
document_text = ' '.join(sample['document'].split('\n'))
print(f"Document:\n{document_text}")
print(f"\nReference summary: {sample['summary']}")
inp = tokenizer('summarize: '+sample['document'], return_tensors='pt')
pred = learn.generate(inp['input_ids'].to(dls.device))
out = tokenizer.decode(pred[0].cpu(), skip_special_tokens=True)
print(f"\nPredicted summary: {out}")