meta_entity: More battle testing.

See the test cases for examples of real entities in the books
that we should accept for now.
This commit is contained in:
Brett Smith 2020-05-06 10:26:25 -04:00
parent 072937eff5
commit 56b644f1db
2 changed files with 81 additions and 28 deletions

View file

@ -25,39 +25,51 @@ from . import core
from .. import data
from .. import errors as errormod
from ..beancount_types import (
MetaKey,
MetaValue,
Transaction,
)
from typing import (
MutableMapping,
Optional,
Pattern,
Tuple,
)
class MetaEntity(core.TransactionHook):
METADATA_KEY = 'entity'
HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
# alnum is the set of characters we always accept in entity metadata:
# letters and digits, minus the Latin 1 supplement (i.e., Roman letters
# with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
# chars is the set of characters we always accept in entity metadata:
# letters, digits, and ASCII punctuation, except `-` and the Latin 1 supplement
# (i.e., Roman letters with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
# See the tests for specific cases.
alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
# A regexp that would be reasonably stricter would be:
# f'^[{alnum}][.{alnum}]*(?:-[.{alnum}])*$'
# However, current producers fail that regexp in a few different ways.
# See the tests for specific cases.
ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1)
del alnum
chars = r'\u0021-\u002c\u002e-\u007e\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
ENTITY_RE: Pattern[str] = regex.compile(f'^[{chars}][-{chars}]*$', regex.VERSION1)
ANONYMOUS_RE: Pattern[str] = regex.compile(r'^[-_.?!\s]*$', regex.VERSION1)
del chars
def _check_entity(self,
                  meta: MutableMapping[MetaKey, MetaValue],
                  default: Optional[str]=None,
) -> Tuple[Optional[str], Optional[bool]]:
    """Look up the entity in ``meta`` and report whether it is acceptable.

    The value is read from ``meta[self.METADATA_KEY]``, falling back to
    ``default`` when the key is absent.

    Returns an ``(entity, ok)`` pair:

    * ``(None, None)`` when there is no value and no default.
    * ``(None, False)`` when the value is not a string.
    * ``('Anonymous', True)`` when the value matches ``ANONYMOUS_RE``;
      in that case ``meta`` is also updated in place to say ``'Anonymous'``.
    * ``(value, matched)`` otherwise, where ``matched`` tells whether the
      value matches ``ENTITY_RE``.
    """
    key = self.METADATA_KEY
    value = meta.get(key, default)
    if value is None:
        return None, None
    if not isinstance(value, str):
        return None, False
    if self.ANONYMOUS_RE.match(value):
        # Normalize placeholder values and write the result back to the
        # metadata. The same string object is stored and returned.
        anonymous = 'Anonymous'
        meta[key] = anonymous
        return anonymous, True
    is_valid = self.ENTITY_RE.match(value) is not None
    return value, is_valid
def run(self, txn: Transaction) -> errormod.Iter:
if data.is_opening_balance_txn(txn):
return
txn_entity = txn.meta.get(self.METADATA_KEY, txn.payee)
if txn_entity is None:
txn_entity_ok = None
elif isinstance(txn_entity, str):
txn_entity_ok = bool(self.ENTITY_RE.match(txn_entity))
else:
txn_entity_ok = False
txn_entity, txn_entity_ok = self._check_entity(txn.meta, txn.payee)
if txn_entity_ok is False:
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, txn_entity)
for post in data.Posting.from_txn(txn):
@ -68,10 +80,8 @@ class MetaEntity(core.TransactionHook):
'Liabilities:Payable',
):
continue
entity = post.meta.get(self.METADATA_KEY)
if entity is None:
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)
elif entity is txn_entity:
entity, entity_ok = self._check_entity(post.meta, txn_entity)
if entity is txn_entity and entity is not None:
pass
elif not self.ENTITY_RE.match(entity):
elif not entity_ok:
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)

View file

@ -37,8 +37,14 @@ VALID_VALUES = {
'スミスダコタ',
'スミス-ダコタ',
'Яшин-Данила',
# The PayPal importer produces . in entity metadata
# Governments, using : as a hierarchy separator
'BE',
'US:KY',
'CA:ON',
# The PayPal importer allows ASCII punctuation in entity metadata
'Du-Bois-W.-E.-B.',
"O'Malley-Thomas",
'O`Malley-Thomas',
# import2ledger produces entities that end with -
# That's probably a bug, but allow it for now.
'foo-',
@ -47,7 +53,6 @@ VALID_VALUES = {
INVALID_VALUES = {
# Starting with a - is not allowed
'-foo',
'-',
# Names that can be reduced to ASCII should be reduced to ASCII.
# Producers should change this to Uberentity or Ueberentity
# I am not wild about this rule and would like to relax it—it's mostly
@ -56,14 +61,25 @@ INVALID_VALUES = {
# mangling producers are expected to do. But it's the rule for today.
'Überentity',
# Whitespace is never allowed
' ',
'Alex Smith',
'田中\u00A0流星', # Non-breaking space
# The only punctuation allowed is - and .
'スミス_ダコタ',
# Non-ASCII punctuation is not allowed
'Яшин—Данила', # em dash
# An empty string is not valid
'OMalley-Thomas', # Right-angled apostrophe
'Du-Bois-W。-E。-B。', # Japanese period
}
ANONYMOUS_VALUES = {
# Values produced by various importers that should be translated to
# Anonymous.
'',
' ',
'-',
'--',
'-----',
'_',
' _ ',
'.',
}
TEST_KEY = 'entity'
@ -81,6 +97,15 @@ def test_valid_values_on_postings(hook, src_value):
])
assert not any(hook.run(txn))
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
def test_anonymous_values_on_postings(hook, src_value):
    """Placeholder entity on a posting: no errors, rewritten to 'Anonymous'."""
    cash_posting = ('Assets:Cash', -25)
    tagged_posting = ('Expenses:General', 25, {TEST_KEY: src_value})
    txn = testutil.Transaction(postings=[
        cash_posting,
        tagged_posting,
    ])
    reported = any(hook.run(txn))
    assert not reported
    # The hook is expected to have rewritten the posting's metadata in place.
    last_posting = txn.postings[-1]
    assert last_posting.meta[TEST_KEY] == 'Anonymous'
@pytest.mark.parametrize('src_value', INVALID_VALUES)
def test_invalid_values_on_postings(hook, src_value):
txn = testutil.Transaction(postings=[
@ -99,6 +124,15 @@ def test_valid_values_on_transactions(hook, src_value):
])
assert not any(hook.run(txn))
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
def test_anonymous_values_on_transactions(hook, src_value):
    """Placeholder entity on a transaction: no errors, rewritten to 'Anonymous'."""
    txn_meta = {TEST_KEY: src_value}
    postings = [
        ('Assets:Cash', -25),
        ('Expenses:General', 25),
    ]
    txn = testutil.Transaction(**txn_meta, postings=postings)
    reported = any(hook.run(txn))
    assert not reported
    # The hook is expected to have rewritten the transaction metadata in place.
    assert txn.meta[TEST_KEY] == 'Anonymous'
@pytest.mark.parametrize('src_value', INVALID_VALUES)
def test_invalid_values_on_transactions(hook, src_value):
txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[
@ -118,6 +152,15 @@ def test_valid_values_on_payee(hook, src_value):
])
assert not any(hook.run(txn))
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
def test_anonymous_values_on_payee(hook, src_value):
    """Placeholder payee with no entity metadata: no errors, entity set to 'Anonymous'."""
    postings = [
        ('Assets:Cash', -25),
        ('Expenses:General', 25),
    ]
    txn = testutil.Transaction(payee=src_value, postings=postings)
    reported = any(hook.run(txn))
    assert not reported
    # The payee is only the fallback; the result must land in the metadata.
    assert txn.meta[TEST_KEY] == 'Anonymous'
@pytest.mark.parametrize('src_value', INVALID_VALUES)
def test_invalid_values_on_payee(hook, src_value):
txn = testutil.Transaction(payee=src_value, postings=[