# --- Experiment configuration -------------------------------------------------
ds_name = "conll2003"
model_name = "google/electra-small-discriminator"
# NOTE(review): max_len is not referenced anywhere else in the visible code.
max_len = 512
bs = 16
val_bs = bs*2  # validation batch size is twice the training batch size
lr = 3e-5
# Load the dataset identified by ds_name.
ds = load_dataset(ds_name)
# NOTE(review): get_splits is defined outside this view; it is called on the
# un-merged dataset, presumably to record split boundaries - confirm.
splits = get_splits(ds)
# Merge the 'train' and 'validation' splits into one dataset (rebinds `ds`).
ds = concatenate_datasets([ds['train'], ds['validation']])
# Bare expression; presumably a notebook display of the first example.
ds[0]
# Selects which "<task>_tags" column supplies the labels.
task = 'ner' # 'pos', 'chunk'
# Class names of the selected tag column; list index == integer label id.
label_vocab = ds.features[f"{task}_tags"].feature.names
# Bare expression; presumably a notebook display of the label names.
label_vocab
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Read by tokenize_and_align_labels: when True, continuation sub-tokens of a word
# receive the word's label; when False they receive -100.
label_all_tokens = True
The preprocessing is explained in the HuggingFace example notebook.
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Reads the module-level ``tokenizer``, ``task`` and ``label_all_tokens``.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (one word list per
            example) and a ``f"{task}_tags"`` entry (one label list per example).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one label id per produced token.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_labels, word_ids):
        # Expand per-word labels to per-token labels for a single example.
        aligned = []
        prev_id = None
        for wid in word_ids:
            if wid is None:
                # Special tokens carry no word id; -100 makes the loss skip them.
                aligned.append(-100)
            elif wid == prev_id and not label_all_tokens:
                # Continuation piece of a word, and only first pieces are labelled.
                aligned.append(-100)
            else:
                # First piece of a word, or any piece when all pieces are labelled.
                aligned.append(word_labels[wid])
            prev_id = wid
        return aligned

    encoded["labels"] = [
        _align(word_labels, encoded.word_ids(batch_index=row))
        for row, word_labels in enumerate(examples[f"{task}_tags"])
    ]
    return encoded
# Run the tokenize/align step over the whole dataset in batches; this adds the
# tokenizer outputs and the aligned "labels" column to each example.
ds = ds.map(tokenize_and_align_labels, batched=True)

# Pipeline description: a single token-classification block fed by the four
# listed columns of each example, with a random train/validation split.
dblock = DataBlock(
    blocks=[TokenClassificationBlock(tokenizer=tokenizer, label_vocab=label_vocab)],
    get_x=KeyGetter(['input_ids', 'attention_mask', 'token_type_ids', 'labels']),
    splitter=RandomSplitter(),
)
%%time
dls = dblock.dataloaders(ds, bs=bs, val_bs=val_bs, num_workes=2)
dls.show_batch(max_n=4)
from datasets import load_metric
# Load the "seqeval" metric and print it to inspect the loaded object.
seqeval = load_metric("seqeval")
print(seqeval)
from fasthugs.metrics import MetricCallback
from typing import Tuple
class Seqeval(MetricCallback):
    """Metric callback that reports seqeval scores for token classification.

    Args:
        label_list: Sequence mapping integer label ids to tag names.
        scores: Names of the seqeval scores to report; each is looked up as
            ``overall_<name>`` in the computed result.
    """

    def __init__(self, label_list, scores: Tuple[str, ...] = ('accuracy', 'f1', 'precision', 'recall')):
        self.metric = load_metric('seqeval')
        # Saves the constructor arguments (label_list, scores) as attributes;
        # preprocess() relies on self.label_list being set here.
        store_attr()
        # NOTE(review): provided by MetricCallback (outside this view); presumably
        # registers one value function per entry in `scores` - confirm.
        self._register_value_funcs()

    @staticmethod
    def _get_score(obj, score, **kwargs):
        """Return the ``overall_<score>`` entry of ``obj.res``."""
        return obj.res[f"overall_{score}"]

    def preprocess(self, predictions, labels):
        """Drop ignored positions and map label ids to tag names.

        Args:
            predictions: Iterable of per-sequence predicted label ids.
            labels: Iterable of per-sequence target label ids; positions whose
                target equals -100 are discarded from both outputs.

        Returns:
            ``(true_predictions, true_labels)``: two lists of lists of tag
            names, one inner list per sequence.
        """
        true_predictions = []
        true_labels = []
        for prediction, label in zip(predictions, labels):
            # Filter each sequence once, then split into the two views.
            kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
            true_predictions.append([self.label_list[p] for p, _ in kept])
            true_labels.append([self.label_list[l] for _, l in kept])
        return true_predictions, true_labels
# Start a Weights & Biases run. WANDB_NAME, GROUP, NOTES, TAGS and CONFIG are
# defined outside this view.
wandb.init(reinit=True, project="fasthugs", entity="fastai_community",
name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS, config=CONFIG);
# Pretrained model with a token-classification head sized to the label vocabulary.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_vocab))
# NOTE(review): loss_func=noop - presumably the loss is computed by the model /
# TransLearner itself rather than by a fastai loss function; confirm.
learn = TransLearner(dls, model, loss_func=noop, cbs=Seqeval(label_vocab))
The final layers of the model are initialized with random weights, so we can verify that the performance is no better than random choice:
# Evaluate the model before any fine-tuning.
learn.validate()
# No additional callbacks are passed to this training run.
cbs = []
# Train for 2 epochs with the one-cycle schedule, using lr and weight decay 0.01.
learn.fit_one_cycle(2, lr, wd=0.01, cbs=cbs)