Source code for span_marker.data_collator
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List
import torch
from torch.nn import functional as F
from span_marker.tokenizer import SpanMarkerTokenizer
@dataclass
class SpanMarkerDataCollator:
"""
Data Collator class responsible for converting the minimal outputs from the tokenizer into
complete and meaningful inputs to the model. In particular, the ``input_ids`` from the tokenizer
features are padded, and the correct amount of start and end markers (with padding) are added.
Furthermore, the position IDs are generated for the input IDs, and ``start_position_ids`` and
``end_position_ids`` are used alongside some padding to create a full position ID vector.
Lastly, the attention matrix is computed.
The expected usage is something like:
>>> collator = SpanMarkerDataCollator(...)
>>> tokenized = tokenizer(...)
>>> batch = collator(tokenized)
>>> output = model(**batch)
"""
tokenizer: SpanMarkerTokenizer
marker_max_length: int
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Convert the minimal tokenizer outputs into inputs ready for :meth:`~span_marker.modeling.SpanMarkerModel.forward`.

        Args:
            features (List[Dict[str, Any]]): A list of dictionaries, one element per sample in the batch.
                The dictionaries contain the following keys:

                * ``input_ids``: The non-padded input IDs.
                * ``num_spans``: The number of spans that should be encoded in each sample.
                * ``start_position_ids``: The position IDs of the start markers in the sample.
                * ``end_position_ids``: The position IDs of the end markers in the sample.
                * ``labels`` (optional): The labels corresponding to each of the spans in the sample.
                * ``num_words`` (optional): The number of words in the input sample.
                  Required for some evaluation metrics.

        Returns:
            Dict[str, torch.Tensor]: Batch dictionary ready to be fed into :meth:`~span_marker.modeling.SpanMarkerModel.forward`.
        """
        total_size = self.tokenizer.model_max_length + 2 * self.marker_max_length
        batch = defaultdict(list)
        num_words = []
        document_ids = []
        sentence_ids = []
        start_marker_indices = []
        num_marker_pairs = []
        for sample in features:
            input_ids = sample["input_ids"]
            num_spans = sample["num_spans"]
            num_tokens = len(input_ids)
            # The start markers start after the input IDs, rounded up to the nearest even number
            start_marker_idx = num_tokens + num_tokens % 2
            end_marker_idx = start_marker_idx + num_spans
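            # Illustrative values: with num_tokens = 4 and num_spans = 2, start_marker_idx = 4 and
            # end_marker_idx = 6, so the start markers occupy slots 4-5 and the end markers slots 6-7.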
            # Prepare input_ids by padding and adding start and end markers
            if not isinstance(input_ids, torch.Tensor):
                input_ids = torch.tensor(input_ids, dtype=torch.int)
            else:
                # ``Tensor.to`` is not in-place, so the result must be reassigned
                input_ids = input_ids.to(torch.int)
            input_ids = F.pad(input_ids, (0, total_size - len(input_ids)), value=self.tokenizer.pad_token_id)
            input_ids[start_marker_idx : start_marker_idx + num_spans] = self.tokenizer.start_marker_id
            input_ids[end_marker_idx : end_marker_idx + num_spans] = self.tokenizer.end_marker_id
            batch["input_ids"].append(input_ids)
            # Prepare position IDs
            # Increase the position_ids by 2, inspired by PL-Marker. The intuition is that these position IDs
            # better match the circumstances under which the underlying encoders are trained.
            position_ids = torch.arange(num_tokens, dtype=torch.int) + 2
            position_ids = F.pad(position_ids, (0, total_size - len(position_ids)), value=1)
            position_ids[start_marker_idx : start_marker_idx + num_spans] = (
                torch.tensor(sample["start_position_ids"]) + 2
            )
            position_ids[end_marker_idx : end_marker_idx + num_spans] = torch.tensor(sample["end_position_ids"]) + 2
            batch["position_ids"].append(position_ids)
            # Prepare attention mask matrix
            attention_mask = torch.zeros((total_size, total_size), dtype=torch.bool)
            # text tokens self-attention
            attention_mask[:num_tokens, :num_tokens] = 1
            # let markers attend text tokens
            attention_mask[start_marker_idx : start_marker_idx + num_spans, :num_tokens] = 1
            attention_mask[end_marker_idx : end_marker_idx + num_spans, :num_tokens] = 1
            # self-attentions of start/end markers
            start_index_list = list(range(start_marker_idx, start_marker_idx + num_spans))
            end_index_list = list(range(end_marker_idx, end_marker_idx + num_spans))
            attention_mask[start_index_list, start_index_list] = 1
            attention_mask[start_index_list, end_index_list] = 1
            attention_mask[end_index_list, start_index_list] = 1
            attention_mask[end_index_list, end_index_list] = 1
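            # The elementwise index lists above mean that each start marker attends only to itself,
            # its paired end marker, and the text tokens, never to the markers of other spans.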
batch["attention_mask"].append(attention_mask)
# Add start of the markers, so the model knows where the input IDs end and where the markers start
start_marker_indices.append(start_marker_idx)
num_marker_pairs.append(end_marker_idx - start_marker_idx)
if "num_words" in sample:
num_words.append(sample["num_words"])
if "document_id" in sample:
document_ids.append(sample["document_id"])
if "sentence_id" in sample:
sentence_ids.append(sample["sentence_id"])
if "labels" in sample:
labels = torch.tensor(sample["labels"])
labels = F.pad(labels, (0, (total_size // 2) - len(labels)), value=-100)
batch["labels"].append(labels)
        batch = {key: torch.stack(value) for key, value in batch.items()}
        # Used for evaluation, does not need to be padded/stacked
        if num_words:
            batch["num_words"] = torch.tensor(num_words)
        if document_ids:
            batch["document_ids"] = torch.tensor(document_ids)
        if sentence_ids:
            batch["sentence_ids"] = torch.tensor(sentence_ids)
        batch["start_marker_indices"] = torch.tensor(start_marker_indices)
        batch["num_marker_pairs"] = torch.tensor(num_marker_pairs)
        return batch
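# Minimal usage sketch: the stand-in tokenizer below only carries the attributes the collator
# reads (``model_max_length``, ``pad_token_id``, ``start_marker_id``, ``end_marker_id``); in
# practice a real SpanMarkerTokenizer is used, and all values here are purely illustrative.
if __name__ == "__main__":
    from types import SimpleNamespace

    stand_in_tokenizer = SimpleNamespace(
        model_max_length=16, pad_token_id=0, start_marker_id=101, end_marker_id=102
    )
    collator = SpanMarkerDataCollator(tokenizer=stand_in_tokenizer, marker_max_length=4)
    features = [
        {
            "input_ids": [12, 48, 7, 105],
            "num_spans": 2,
            "start_position_ids": [0, 1],
            "end_position_ids": [0, 2],
            "labels": [3, 0],
        }
    ]
    batch = collator(features)
    # Expected shapes: input_ids/position_ids (1, 24), attention_mask (1, 24, 24), labels (1, 12)
    print({key: tuple(value.shape) for key, value in batch.items()})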