Source code for span_marker.evaluation

import warnings
from typing import Dict

import evaluate
import torch
from sklearn.exceptions import UndefinedMetricWarning
from transformers import EvalPrediction

from span_marker.tokenizer import SpanMarkerTokenizer


def compute_f1_via_seqeval(
    tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction, is_in_train: bool
) -> Dict[str, float]:
    """Compute micro-F1, recall, precision and accuracy scores using ``seqeval`` for the evaluation predictions.

    Note:
        We assume that samples are not shuffled for the evaluation/prediction.
        In other words, don't use this on the (shuffled) train dataset!

    Args:
        tokenizer (SpanMarkerTokenizer): The model's tokenizer.
        eval_prediction (~transformers.EvalPrediction): The predictions resulting from the evaluations.
        is_in_train (bool): Whether this evaluation occurs during training, in which case
            only the float-valued (overall) metrics are returned.

    Returns:
        Dict[str, float]: Dictionary with ``"overall_precision"``, ``"overall_recall"``,
            ``"overall_f1"`` and ``"overall_accuracy"`` keys.
    """
    inputs = eval_prediction.inputs
    gold_labels = eval_prediction.label_ids
    logits = eval_prediction.predictions[0]
    num_words = eval_prediction.predictions[2]
    has_document_context = len(eval_prediction.predictions) == 5
    if has_document_context:
        document_ids = eval_prediction.predictions[3]
        sentence_ids = eval_prediction.predictions[4]

    # Compute probabilities via softmax and extract 'winning' scores/labels
    probs = torch.tensor(logits, dtype=torch.float32).softmax(dim=-1)
    scores, pred_labels = probs.max(-1)

    # Collect all samples in one dict. We do this because some samples are spread across multiple inputs
    sample_list = []
    for sample_idx in range(inputs.shape[0]):
        tokens = inputs[sample_idx]
        text = tokenizer.decode(tokens, skip_special_tokens=True)
        token_hash = hash(text) if not has_document_context else (document_ids[sample_idx], sentence_ids[sample_idx])
        # Start a new sample if this is the first input, if the hash differs from the previous
        # input, or if the previous sample already has a label for each of its spans
        if (
            not sample_list
            or sample_list[-1]["hash"] != token_hash
            or len(sample_list[-1]["spans"]) == len(sample_list[-1]["gold_labels"])
        ):
            mask = gold_labels[sample_idx] != -100
            spans = list(tokenizer.get_all_valid_spans(num_words[sample_idx], tokenizer.config.entity_max_length))
            sample_list.append(
                {
                    "text": text,
                    "gold_labels": gold_labels[sample_idx][mask].tolist(),
                    "pred_labels": pred_labels[sample_idx][mask].tolist(),
                    "scores": scores[sample_idx].tolist(),
                    "num_words": num_words[sample_idx],
                    "hash": token_hash,
                    "spans": spans,
                }
            )
        else:
            # Continuation of the previous sample: append the labels and scores of this input
            mask = gold_labels[sample_idx] != -100
            sample_list[-1]["gold_labels"] += gold_labels[sample_idx][mask].tolist()
            sample_list[-1]["pred_labels"] += pred_labels[sample_idx][mask].tolist()
            sample_list[-1]["scores"] += scores[sample_idx].tolist()

    outside_id = tokenizer.config.outside_id
    id2label = tokenizer.config.id2label

    # seqeval works wonders for NER evaluation
    seqeval = evaluate.load("seqeval")
    for sample in sample_list:
        scores = sample["scores"]
        num_words = sample["num_words"]
        spans = sample["spans"]
        gold_labels = sample["gold_labels"]
        pred_labels = sample["pred_labels"]
        assert len(gold_labels) == len(pred_labels) and len(spans) == len(pred_labels)

        # Construct IOB2 format for gold labels, useful for seqeval
        gold_labels_per_tokens = ["O"] * num_words
        for span, gold_label in zip(spans, gold_labels):
            if gold_label != outside_id:
                gold_labels_per_tokens[span[0]] = "B-" + id2label[gold_label]
                gold_labels_per_tokens[span[0] + 1 : span[1]] = ["I-" + id2label[gold_label]] * (span[1] - span[0] - 1)

        # Same for predictions; note that we place the most likely spans first
        # and disallow overlapping spans for now
        pred_labels_per_tokens = ["O"] * num_words
        for _, span, pred_label in sorted(zip(scores, spans, pred_labels), key=lambda tup: tup[0], reverse=True):
            if pred_label != outside_id and all(pred_labels_per_tokens[i] == "O" for i in range(span[0], span[1])):
                pred_labels_per_tokens[span[0]] = "B-" + id2label[pred_label]
                pred_labels_per_tokens[span[0] + 1 : span[1]] = ["I-" + id2label[pred_label]] * (span[1] - span[0] - 1)

        seqeval.add(prediction=pred_labels_per_tokens, reference=gold_labels_per_tokens)

    # seqeval emits UndefinedMetricWarning when a label never gets predicted; silence it
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UndefinedMetricWarning)
        results = seqeval.compute()

    # During training, only report the float-valued overall metrics, not the per-label dictionaries
    if is_in_train:
        return {key: value for key, value in results.items() if isinstance(value, float)}
    return results
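Two hedged sketches follow; neither is part of the module above.

First, for intuition on the IOB2 construction inside the loop: ``seqeval`` consumes per-token IOB2 tags, so each winning span is written out as one ``B-`` tag followed by ``I-`` tags. A minimal, self-contained toy run with hypothetical tags:

import evaluate

seqeval = evaluate.load("seqeval")
# One exactly-matching PER span, one missed LOC span
seqeval.add(
    prediction=["B-PER", "I-PER", "O", "O", "O"],
    reference=["B-PER", "I-PER", "O", "B-LOC", "O"],
)
results = seqeval.compute()
# precision = 1.0 (1 of 1 predicted spans correct), recall = 0.5 (1 of 2 gold spans found),
# so results["overall_f1"] is roughly 0.667

Second, a minimal sketch of binding this function as a ``compute_metrics`` callback. The names ``model`` and ``eval_dataset`` are placeholders for a trained SpanMarker model and a preprocessed evaluation dataset; SpanMarker's own ``Trainer`` performs equivalent wiring internally, so this is illustrative only. ``include_inputs_for_metrics=True`` is needed because the metric reads ``eval_prediction.inputs``:

from functools import partial

from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="eval_output",
    include_inputs_for_metrics=True,  # the metric reads eval_prediction.inputs
)
trainer = Trainer(
    model=model,  # placeholder: a trained SpanMarker model
    args=args,
    eval_dataset=eval_dataset,  # placeholder: a preprocessed evaluation dataset
    # Bind the tokenizer and is_in_train so the callable matches the
    # compute_metrics(eval_prediction) signature that transformers expects
    compute_metrics=partial(compute_f1_via_seqeval, model.tokenizer, is_in_train=False),
)
metrics = trainer.evaluate()
# e.g. {"eval_overall_precision": ..., "eval_overall_recall": ..., "eval_overall_f1": ..., ...}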