Source code for inflex.term

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__all__ = [
    "Term"
]

import re
from typing import Callable, Dict, Generator, List, Pattern, TypeVar, Union

T = TypeVar("T")  # pylint: disable=C0103


def list_to_generator(input_list: List[T]) -> Generator[T, None, None]:
    """Yield elements from a list, repeating the very last element infinitely.

    Args:
        input_list (List[T]): List of elements.

    Yields:
        T: Element of `input_list`
    """
    while True:
        yield input_list[0]
        if len(input_list) > 1:
            input_list = input_list[1:]
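
# Illustrative usage sketch (not part of the module): `list_to_generator` keeps
# yielding its final element once the list is exhausted, which `_encase` below
# relies on when a target phrase has more words than the original term.
#
#     >>> gen = list_to_generator([str.upper, str.lower])
#     >>> [next(gen)("Word") for _ in range(4)]
#     ['WORD', 'word', 'word', 'word']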

def _transform(func: Callable[[str], str]) -> Callable[[str], str]:
    """Wrap `func` so that "i" or "I" always maps to "I"; otherwise `func` is applied unchanged.

    Args:
        func (Callable[[str], str]): Function for converting the casing of an input string.

    Returns:
        Callable[[str], str]: Function that converts an input string according to
            `func`'s casing rules, except that "i"/"I" is always returned as "I".
    """
    return lambda word: "I" if word.lower() == "i" else func(word)
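
# Illustrative usage sketch (not part of the module): `_transform` guards the
# pronoun "i"/"I" while delegating every other word to the wrapped casing function.
#
#     >>> title_cased = _transform(str.title)
#     >>> title_cased("i"), title_cased("cat")
#     ('I', 'Cat')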

class Term:
    """`Term` is the base class of the `Noun`, `Verb`, `Adjective` subclasses,
    and holds some default implementations of methods used across these subclasses.

    Method docstrings from this class are inherited by the subclasses' methods.
    """

    # Supported casing formats: I, lower, Title, UPPER, Mc
    # Note that if the passed word is "i", we always output "I"
    _casing_formats: Dict[str, Dict[str, Union[Pattern[str], Callable[[str], str]]]] = {
        "I": {
            "regex": re.compile(r"^I$"),
            "transformation": _transform(str.lower)
        },
        "lower": {
            "regex": re.compile(r"^[^A-Z]+$"),
            "transformation": _transform(str.lower)
        },
        "title": {
            "regex": re.compile(r"^[A-Z][^A-Z]+$"),
            "transformation": _transform(str.title)
        },
        "upper": {
            "regex": re.compile(r"^[A-Z]+s?$"),
            "transformation": _transform(lambda word: word[:-1].upper() + word[-1]
                                         if word.endswith(("s", "S"))
                                         else word.upper())
        },
        "Mc": {
            "regex": re.compile(r"^Mc[A-Z][^A-Z]+$"),
            "transformation": _transform(lambda word: "Mc" + word[2:].title()
                                         if word.lower().startswith("mc")
                                         else word.title())
        },
    }

    # Regex for finding a word
    _word_regex = re.compile(r"([^\r\n\t\f\v\-\' ]+)")

    # Regex for extracting whitespace before and after input
    _whitespace_regex = re.compile(r"(?P<start>^\s*).*?(?P<end>\s*$)")
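
    # Illustrative sketch (not part of the module): each casing format pairs a
    # detection regex with a transformation, e.g. the "Mc" format recognises
    # "McDonald" and can re-apply that casing to a lowercased word.
    #
    #     >>> bool(Term._casing_formats["Mc"]["regex"].match("McDonald"))
    #     True
    #     >>> Term._casing_formats["Mc"]["transformation"]("mcdonald")
    #     'McDonald'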

    def __init__(self, term: str):
        """Creates class instance with detection and conversion methods.

        Note:
            Capitalisation and whitespace will be preserved between input `term`
            and generated output.

        Args:
            term (str): Input word or collocation.
        """
        super().__init__()
        # Whitespace strings before and after the term
        self.start = ""
        self.end = ""
        # Default format for the separator between words
        self.spaces = None

        self.term = term.strip()
        # Extract whitespace before and after the term
        if term.startswith(" ") or term.endswith(" "):
            self.start, self.end = Term._whitespace_regex.match(term).groups()  # type: ignore

        # If there is troublesome double whitespace, find the substrings
        # between words and normalize them
        # NOTE: Assume there are no tabs, newlines, etc. in the input terms
        if " " in self.term or "-" in self.term:
            self.spaces = re.findall(r"([\r\n\t\f\v\- ]+)", self.term)
            if " " in self.term:
                self.term = re.sub(r"\s{2,}", " ", self.term)
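
    # Illustrative sketch (not part of the module): `__init__` strips and stores
    # surrounding whitespace, and remembers in-word separators so they can be
    # restored later by `_reapply_whitespace`.
    #
    #     >>> t = Term("  brown  bear ")
    #     >>> t.term, t.start, t.end, t.spaces
    #     ('brown bear', '  ', ' ', ['  '])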

    def is_noun(self) -> bool:  # pylint: disable=R0201
        """Returns `True` only if this object is instantiated via `Noun(term)`.

        Returns:
            bool: Returns `True` only if this object is instantiated via `Noun(term)`.
        """
        return False

    def is_verb(self) -> bool:  # pylint: disable=R0201
        """Returns `True` only if this object is instantiated via `Verb(term)`.

        Returns:
            bool: Returns `True` only if this object is instantiated via `Verb(term)`.
        """
        return False

    def is_adj(self) -> bool:  # pylint: disable=R0201
        """Returns `True` only if this object is instantiated via `Adjective(term)`.

        Returns:
            bool: Returns `True` only if this object is instantiated via `Adjective(term)`.
        """
        return False

    def is_singular(self) -> bool:
        """Detect whether this object is in singular form.

        Returns:
            bool: True if this object is deemed singular.
        """
        raise NotImplementedError()

    def is_plural(self) -> bool:
        """Detect whether this object is in plural form.

        Returns:
            bool: True if this object is deemed plural.
        """
        raise NotImplementedError()

    def singular(self, person: int = 0) -> str:
        """Returns this object's singular form.

        Args:
            person (int, optional): Represents the grammatical "person" (1st, 2nd, 3rd).
                This option only affects personal and possessive pronouns, possessive
                adjectives, and verbs. Defaults to 0.

        Returns:
            str: This object's singular form.
        """
        raise NotImplementedError()

    def plural(self, person: int = 0) -> str:
        """Returns this object's plural form.

        Args:
            person (int, optional): Represents the grammatical "person" (1st, 2nd, 3rd).
                This option only affects personal and possessive pronouns, possessive
                adjectives, and verbs. Defaults to 0.

        Returns:
            str: This object's plural form.
        """
        raise NotImplementedError()

    def lemma(self) -> str:
        """Return this object's lemma form.

        Returns:
            str: This object's lemma form.
        """
        raise NotImplementedError()

    def classical(self) -> "Term":
        """Returns an object that always inflects in the classical/unassimilated manner.

        Examples:
            >>> Noun('cow').plural()
            'cows'
            >>> Noun('cow').unassimilated().plural()
            'kine'

        Note:
            Identical to `unassimilated()`.

        Returns:
            Term: A Term object, or a subclass thereof.
        """
        return self

    def unassimilated(self) -> "Term":
        """Returns an object that always inflects in the classical/unassimilated manner.

        Examples:
            >>> Noun('cow').plural()
            'cows'
            >>> Noun('cow').unassimilated().plural()
            'kine'

        Note:
            Identical to `classical()`.

        Returns:
            Term: A Term object, or a subclass thereof.
        """
        return self.classical()  # pragma: no cover

    def _check_valid_person(self, person: int) -> bool:  # pylint: disable=R0201
        """Return True if `person` is valid, i.e. in [0, 1, 2, 3].
        Otherwise, raise a ValueError stating that the `person` parameter is invalid.

        Args:
            person (int): Represents the grammatical "person" (1st, 2nd, 3rd).

        Raises:
            ValueError: If `person` is invalid, this exception is raised.

        Returns:
            bool: True if `person` is valid, i.e. in [0, 1, 2, 3].
        """
        if person not in [0, 1, 2, 3]:
            raise ValueError(
                "Invalid `person` parameter supplied. Valid values include 0, 1, 2, and 3.")
        return True

    def as_regex(self) -> Pattern[str]:
        """Returns a `re.Pattern` which case-insensitively matches any inflected form of the word.

        Returns:
            re.Pattern: Compiled regex object which case-insensitively matches
                any inflected form of the word.

        Examples:
            >>> Noun('cherub').as_regex()
            re.compile('cherubs|cherubim|cherub', re.IGNORECASE)
            >>> Verb('eat').as_regex()
            re.compile('eats|eating|eaten|eat|ate', re.IGNORECASE)
        """
        return re.compile("|".join(sorted(map(re.escape,
                                              {self.singular(),  # type: ignore
                                               self.plural()}),
                                          reverse=True)),
                          flags=re.I)

    def __repr__(self) -> str:
        """Return `repr(self)`.

        Examples:
            >>> noun = Noun("book")
            >>> f"My noun: {noun!r}"
            "My noun: Noun('book')"
        """
        return f"{self.__class__.__name__}({self._reapply_whitespace(self.term)!r})"

    def _encase(self, target: str) -> str:
        """Apply casing from `self.term` string onto `target` string.

        TODO: Currently "show--off" -> "show----off"
        TODO: self.term as i-th and target as i-th -> I-th
            : Perhaps don't force capitalize I if followed by a hyphen.
        TODO: Let self.term as ABC and target as ABCs convert to ABCs rather than ABCS
            : Also consider that some Noun tests may be broken

        Args:
            target (str): The word or collocation on which to apply the casing
                that exists on `self.term`.

        Returns:
            str: `target`, but encased according to the patterns applied on `self.term`.
        """
        # Split off 's
        suffix = ""
        if target.endswith("'s"):
            target = target[:-2]
            suffix = "'s"

        # Special case for 'I'
        if self.term == "I" or target == "I":
            return self._reapply_whitespace(target + suffix)

        # Get list of lambda functions that correspond to the
        # casing formats for `original`.
        transformations: List[Callable[[str], str]] = []
        for word in Term._word_regex.findall(self.term):
            for casing_format in Term._casing_formats.values():
                if casing_format["regex"].match(word):  # type: ignore
                    transformations.append(casing_format["transformation"])  # type: ignore
                    break
            else:
                # If no casing regex matches
                transformations.append(lambda word: word)

        # If no words were found in the term, just return target
        if not transformations:
            return self._reapply_whitespace(target + suffix)

        # Generator that gets the next transformation until there is
        # just one transformation left, after which it will
        # continuously yield that last transformation.
        # Apply the transformations found in `original` to `target`
        transformations_gen = list_to_generator(transformations)

        # Phrase is target, but with the proper casing from the term applied
        phrase = Term._word_regex.sub(
            lambda match_obj: next(transformations_gen)(match_obj.group()),
            target)

        return self._reapply_whitespace(phrase + suffix)

    def _reapply_whitespace(self, phrase: str) -> str:
        """Reapply whitespace formats before, after and within a phrase,
        based on `self.start`, `self.end` and `self.spaces`, which were saved at `__init__()`.

        Args:
            phrase (str): The word or collocation to which whitespace and hyphens are added.

        Returns:
            str: `phrase`, but with whitespace before, after and within the phrase.
        """
        if self.spaces:
            spaces_iter = iter(self.spaces)
            return self.start +\
                re.sub("-| ",
                       lambda _: next(spaces_iter),
                       phrase.strip(),
                       count=len(self.spaces)) +\
                self.end
        return self.start + phrase.strip() + self.end
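
# Illustrative sketch (not part of the module): `_encase` and `_reapply_whitespace`
# combine to carry the original term's casing and spacing over to a new phrase.
# Subclasses such as `Noun` call `_encase` on their inflection output; calling it
# on a bare `Term` here is purely for demonstration.
#
#     >>> Term("  brown  bear ")._encase("brown bears")
#     '  brown  bears '
#     >>> Term("McDonald")._encase("mcdonalds")
#     'McDonalds'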