#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Public API of this module: only `Term` is exported by `import *`.
__all__ = ["Term"]
import re
from typing import Callable, Dict, Generator, List, Pattern, TypeVar, Union
T = TypeVar("T")  # pylint: disable=C0103


def list_to_generator(input_list: List[T]) -> Generator[T, None, None]:
    """Yield each element of a list, then repeat the very last element infinitely.

    Args:
        input_list (List[T]): List of elements. Must contain at least one element.

    Yields:
        T: Element of `input_list`. Once the list is exhausted, its last
        element is yielded forever, so the generator never terminates.

    Raises:
        IndexError: On the first `next()` call if `input_list` is empty.
    """
    # Snapshot the input once so later mutation of the caller's list cannot
    # affect the sequence; this also avoids re-slicing the list on every step.
    items = list(input_list)
    if not items:
        raise IndexError("list_to_generator() requires a non-empty list")

    yield from items

    # Keep handing out the final element for as long as the caller asks.
    last = items[-1]
    while True:
        yield last
def _transform(func: Callable[[str], str]) -> Callable[[str], str]:
    """Wrap a casing function so that the word "i" is always rendered as "I".

    Args:
        func (Callable[[str], str]): Function for converting the casing of an input string.

    Returns:
        Callable[[str], str]: Function that returns "I" when called with "i" or "I",
        and otherwise returns `func(word)`.
    """
    def _cased(word: str) -> str:
        # The pronoun "I" overrides whatever casing rule `func` implements.
        if word.lower() == "i":
            return "I"
        return func(word)

    return _cased
class Term:
    """`Term` is the base class of the `Noun`, `Verb`, `Adjective` subclasses,
    and holds some default implementations of methods used across these
    subclasses.

    Method docstrings from this class are inherited to the subclasses' methods.
    """

    # Supported casing formats: I, lower, Title, UPPER, Mc
    # Note that if the passed word is "i", we always output "I"
    # The order of the entries matters: `_encase` uses the first format whose
    # regex matches a word of the term.
    _casing_formats: Dict[str, Dict[str, Union[Pattern[str], Callable[[str], str]]]] = {
        "I": {
            "regex": re.compile(r"^I$"),
            "transformation": _transform(str.lower)
        },
        "lower": {
            "regex": re.compile(r"^[^A-Z]+$"),
            "transformation": _transform(str.lower)
        },
        "title": {
            "regex": re.compile(r"^[A-Z][^A-Z]+$"),
            "transformation": _transform(str.title)
        },
        "upper": {
            "regex": re.compile(r"^[A-Z]+s?$"),
            # Upper-case everything except a trailing "s"/"S", which is kept as given
            "transformation": _transform(
                lambda word: word[:-1].upper() + word[-1]
                if word.endswith(("s", "S"))
                else word.upper()
            )
        },
        "Mc": {
            "regex": re.compile(r"^Mc[A-Z][^A-Z]+$"),
            "transformation": _transform(
                lambda word: "Mc" + word[2:].title()
                if word.lower().startswith("mc")
                else word.title()
            )
        },
    }

    # Regex for finding a word
    _word_regex = re.compile(r"([^\r\n\t\f\v\-\' ]+)")

    # Regex for extracting whitespace before and after input
    # (no longer used by `__init__`, kept for backward compatibility)
    _whitespace_regex = re.compile(r"(?P<start>^\s*).*?(?P<end>\s*$)")

    # Regex for a run of separator characters (whitespace and hyphens) between
    # words. Used both to record the separators of the input term and to put
    # them back, so that each recorded run maps onto exactly one run in the output.
    _separator_regex = re.compile(r"[\r\n\t\f\v\- ]+")

    def __init__(self, term: str):
        """Creates class instance with detection and conversion methods.

        Note:
            Capitalisation and whitespace will be preserved between input `term` and
            generated output.

        Args:
            term (str): Input word or collocation.
        """
        super().__init__()

        self.term = term.strip()

        # Whitespace strings before and after the term. Sliced off the raw
        # input so that every kind of whitespace removed by `strip()` is kept,
        # and so that a term with interior newlines cannot break the extraction.
        leading_length = len(term) - len(term.lstrip())
        self.start = term[:leading_length]
        # For an all-whitespace input everything already belongs to `start`
        self.end = term[len(term.rstrip()):] if self.term else ""

        # Separators found between words; None when the term has no spaces or hyphens
        self.spaces = None

        # If there is troublesome double whitespace, find the substrings
        # between words and normalize them
        if " " in self.term or "-" in self.term:
            self.spaces = Term._separator_regex.findall(self.term)
            if " " in self.term:
                self.term = re.sub(r"\s{2,}", " ", self.term)

    def is_noun(self) -> bool:  # pylint: disable=R0201
        """Returns `True` only if this object is instantiated via `Noun(term)`.

        Returns:
            bool: Returns `True` only if this object is instantiated via `Noun(term)`.
            This base implementation always returns `False`.
        """
        return False

    def is_verb(self) -> bool:  # pylint: disable=R0201
        """Returns `True` only if this object is instantiated via `Verb(term)`.

        Returns:
            bool: Returns `True` only if this object is instantiated via `Verb(term)`.
            This base implementation always returns `False`.
        """
        return False

    def is_adj(self) -> bool:  # pylint: disable=R0201
        """Returns `True` only if this object is instantiated via `Adjective(term)`.

        Returns:
            bool: Returns `True` only if this object is instantiated via `Adjective(term)`.
            This base implementation always returns `False`.
        """
        return False

    def is_singular(self) -> bool:
        """Detect whether this object is in singular form.

        Returns:
            bool: True if this object is deemed singular.

        Raises:
            NotImplementedError: The base class provides no implementation.
        """
        raise NotImplementedError()

    def is_plural(self) -> bool:
        """Detect whether this object is in plural form.

        Returns:
            bool: True if this object is deemed plural.

        Raises:
            NotImplementedError: The base class provides no implementation.
        """
        raise NotImplementedError()

    def singular(self, person: int = 0) -> str:
        """Returns this object's singular form.

        Args:
            person (Optional[int], optional): Represents the grammatical "person" (1st, 2nd, 3rd).
                This option only affects personal and possessive pronouns, possessive adjectives,
                and verbs. Defaults to 0.

        Returns:
            str: This object's singular form.

        Raises:
            NotImplementedError: The base class provides no implementation.
        """
        raise NotImplementedError()

    def plural(self, person: int = 0) -> str:
        """Returns this object's plural form.

        Args:
            person (Optional[int], optional): Represents the grammatical "person" (1st, 2nd, 3rd).
                This option only affects personal and possessive pronouns, possessive adjectives,
                and verbs. Defaults to 0.

        Returns:
            str: This object's plural form.

        Raises:
            NotImplementedError: The base class provides no implementation.
        """
        raise NotImplementedError()

    def lemma(self) -> str:
        """Return this object's lemma form.

        Returns:
            str: This object's lemma form.

        Raises:
            NotImplementedError: The base class provides no implementation.
        """
        raise NotImplementedError()

    def classical(self) -> "Term":
        """Returns an object always inflecting in the classical/unassimilated manner.

        Examples:
            >>> Noun('cow').plural()
            'cows'
            >>> Noun('cow').classical().plural()
            'kine'

        Note:
            Identical to `unassimilated()`.

        Returns:
            Term: A Term object, or a subclass thereof. This base implementation
            returns the object itself.
        """
        return self

    def unassimilated(self) -> "Term":
        """Returns an object always inflecting in the classical/unassimilated manner.

        Examples:
            >>> Noun('cow').plural()
            'cows'
            >>> Noun('cow').unassimilated().plural()
            'kine'

        Note:
            Identical to `classical()`.

        Returns:
            Term: A Term object, or a subclass thereof.
        """
        return self.classical()  # pragma: no cover

    def _check_valid_person(self, person: int) -> bool:  # pylint: disable=R0201
        """Return True if `person` is valid, i.e. in [0, 1, 2, 3].

        Args:
            person (int): Represents the grammatical "person" (1st, 2nd, 3rd).

        Raises:
            ValueError: If `person` is not one of 0, 1, 2 or 3.

        Returns:
            bool: Always True; an invalid `person` raises instead of returning False.
        """
        if person not in (0, 1, 2, 3):
            raise ValueError(
                "Invalid `person` parameter supplied. Valid values include 0, 1, 2, and 3.")
        return True

    def as_regex(self) -> Pattern[str]:
        """Returns a `re.Pattern` which case-insensitively matches any inflected form of the word.

        This base implementation combines the results of `singular()` and `plural()`.

        Returns:
            re.Pattern: Compiled regex object which case-insensitively matches any inflected form
            of the word.

        Examples:
            >>> Noun('cherub').as_regex()
            re.compile('cherubs|cherubim|cherub', re.IGNORECASE)
            >>> Verb('eat').as_regex()
            re.compile('eats|eating|eaten|eat|ate', re.IGNORECASE)
        """
        # A set removes duplicates when singular and plural coincide
        forms = {self.singular(), self.plural()}
        # Reverse sorting puts a longer form before any form that is its
        # prefix ("cherubs" before "cherub"), so the alternation tries it first
        alternatives = sorted(map(re.escape, forms), reverse=True)  # type: ignore
        return re.compile("|".join(alternatives), flags=re.I)

    def __repr__(self) -> str:
        """Return `repr(self)`.

        Examples:
            >>> noun = Noun("book")
            >>> f"My noun: {noun!r}"
            "My noun: Noun('book')"
        """
        return f"{self.__class__.__name__}({self._reapply_whitespace(self.term)!r})"

    def _encase(self, target: str) -> str:
        """Apply casing from `self.term` string onto `target` string.

        TODO: self.term as i-th and target as i-th -> I-th
            : Perhaps don't force capitalize I if followed by a hyphen.
        TODO: Let self.term as ABC and target as ABCs convert to ABCs rather than ABCS
            : Also consider that some Noun tests may be broken

        Args:
            target (str): The word or collocation on which to apply the casing
                that exists on `self.term`.

        Returns:
            str: `target`, but encased according to the patterns applied on `self.term`.
        """
        # Split off 's so that it is never re-cased
        suffix = ""
        if target.endswith("'s"):
            target = target[:-2]
            suffix = "'s"

        # Special case for 'I'
        if self.term == "I" or target == "I":
            return self._reapply_whitespace(target + suffix)

        # Get list of functions that correspond to the casing formats
        # of each word in `self.term`.
        transformations: List[Callable[[str], str]] = []
        for word in Term._word_regex.findall(self.term):
            for casing_format in Term._casing_formats.values():
                if casing_format["regex"].match(word):  # type: ignore
                    transformations.append(casing_format["transformation"])  # type: ignore
                    break
            else:
                # If no casing regexes matches, leave the word untouched
                transformations.append(lambda word: word)

        # If no words found in term, just return target
        if not transformations:
            return self._reapply_whitespace(target + suffix)

        # Generator that gets next transformation until there is
        # just one transformation left, after which it will
        # continuously yield that last transformation. This lets `target`
        # contain more words than `self.term`.
        transformations_gen = list_to_generator(transformations)

        # Phrase is target, but with the proper casing from the term applied
        phrase = Term._word_regex.sub(
            lambda match_obj: next(transformations_gen)(match_obj.group()),
            target)
        return self._reapply_whitespace(phrase + suffix)

    def _reapply_whitespace(self, phrase: str) -> str:
        """Reapply whitespace formats before, after and within a phrase.

        Based on `self.start`, `self.end` and `self.spaces` which were saved at __init__().

        Args:
            phrase (str): The word or collocation on which whitespace and hyphens are added.

        Returns:
            str: `phrase`, but with whitespace before, after and within a phrase.
        """
        body = phrase.strip()
        if self.spaces:
            spaces_iter = iter(self.spaces)
            # Replace whole separator runs, mirroring how `self.spaces` was
            # captured; replacing single characters would duplicate
            # multi-character separators such as "--" or " - ".
            # `count` guarantees the iterator is never exhausted.
            body = Term._separator_regex.sub(
                lambda _: next(spaces_iter),
                body,
                count=len(self.spaces))
        return self.start + body + self.end