From 56b644f1db9bfe026aa667f22923e281df33e22b Mon Sep 17 00:00:00 2001 From: Brett Smith Date: Wed, 6 May 2020 10:26:25 -0400 Subject: [PATCH] meta_entity: More battle testing. See the test cases for examples of real entities in the books that we should accept for now. --- conservancy_beancount/plugin/meta_entity.py | 54 +++++++++++--------- tests/test_meta_entity.py | 55 ++++++++++++++++++--- 2 files changed, 81 insertions(+), 28 deletions(-) diff --git a/conservancy_beancount/plugin/meta_entity.py b/conservancy_beancount/plugin/meta_entity.py index 6f9d130..4370170 100644 --- a/conservancy_beancount/plugin/meta_entity.py +++ b/conservancy_beancount/plugin/meta_entity.py @@ -25,39 +25,51 @@ from . import core from .. import data from .. import errors as errormod from ..beancount_types import ( + MetaKey, + MetaValue, Transaction, ) from typing import ( + MutableMapping, + Optional, Pattern, + Tuple, ) class MetaEntity(core.TransactionHook): METADATA_KEY = 'entity' HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY]) - # alnum is the set of characters we always accept in entity metadata: - # letters and digits, minus the Latin 1 supplement (i.e., Roman letters - # with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.) + # chars is the set of characters we always accept in entity metadata: + # letters, digits, and ASCII punctuation, except `-` and the Latin 1 supplement + # (i.e., Roman letters with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.) # See the tests for specific cases. - alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}' - # A regexp that would be reasonably stricter would be: - # f'^[{alnum}][.{alnum}]*(?:-[.{alnum}])*$' - # However, current producers fail that regexp in a few different ways. - # See the tests for specific cases. - ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1) - del alnum + chars = r'\u0021-\u002c\u002e-\u007e\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}' + ENTITY_RE: Pattern[str] = regex.compile(f'^[{chars}][-{chars}]*$', regex.VERSION1) + ANONYMOUS_RE: Pattern[str] = regex.compile(r'^[-_.?!\s]*$', regex.VERSION1) + del chars + + def _check_entity(self, + meta: MutableMapping[MetaKey, MetaValue], + default: Optional[str]=None, + ) -> Tuple[Optional[str], Optional[bool]]: + entity = meta.get(self.METADATA_KEY, default) + if entity is None: + return None, None + elif not isinstance(entity, str): + return None, False + elif self.ANONYMOUS_RE.match(entity): + entity = 'Anonymous' + meta[self.METADATA_KEY] = entity + return entity, True + else: + return entity, self.ENTITY_RE.match(entity) is not None def run(self, txn: Transaction) -> errormod.Iter: if data.is_opening_balance_txn(txn): return - txn_entity = txn.meta.get(self.METADATA_KEY, txn.payee) - if txn_entity is None: - txn_entity_ok = None - elif isinstance(txn_entity, str): - txn_entity_ok = bool(self.ENTITY_RE.match(txn_entity)) - else: - txn_entity_ok = False + txn_entity, txn_entity_ok = self._check_entity(txn.meta, txn.payee) if txn_entity_ok is False: yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, txn_entity) for post in data.Posting.from_txn(txn): @@ -68,10 +80,8 @@ class MetaEntity(core.TransactionHook): 'Liabilities:Payable', ): continue - entity = post.meta.get(self.METADATA_KEY) - if entity is None: - yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post) - elif entity is txn_entity: + entity, entity_ok = self._check_entity(post.meta, txn_entity) + if entity is txn_entity and entity is not None: pass - elif not self.ENTITY_RE.match(entity): + elif not entity_ok: yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post) diff --git a/tests/test_meta_entity.py b/tests/test_meta_entity.py index 0711453..5cd4c6d 100644 --- a/tests/test_meta_entity.py +++ b/tests/test_meta_entity.py @@ -37,8 +37,14 @@ VALID_VALUES = { 'スミスダコタ', 'スミス-ダコタ', 'Яшин-Данила', - # The PayPal importer produces . in entity metadata + # Governments, using : as a hierarchy separator + 'BE', + 'US:KY', + 'CA:ON', + # The PayPal importer allows ASCII punctuation in entity metadata 'Du-Bois-W.-E.-B.', + "O'Malley-Thomas", + 'O`Malley-Thomas', # import2ledger produces entities that end with - # That's probably a bug, but allow it for now. 'foo-', @@ -47,7 +53,6 @@ VALID_VALUES = { INVALID_VALUES = { # Starting with a - is not allowed '-foo', - '-', # Names that can be reduced to ASCII should be # Producers should change this to Uberentity or Ueberentity # I am not wild about this rule and would like to relax it—it's mostly @@ -56,14 +61,25 @@ INVALID_VALUES = { # mangling producers are expected to do. But it's the rule for today. 'Überentity', # Whitespace is never allowed - ' ', 'Alex Smith', '田中\u00A0流星', # Non-breaking space - # The only punctuation allowed is - and . - 'スミス_ダコタ', + # Non-ASCII punctuation is not allowed 'Яшин—Данила', # em dash - # An empty string is not valid + 'O’Malley-Thomas', # Right-angled apostrophe + 'Du-Bois-W。-E。-B。', # Japanese period +} + +ANONYMOUS_VALUES = { + # Values produced by various importers that should be translated to + # Anonymous. '', + ' ', + '-', + '--', + '-----', + '_', + ' _ ', + '.', } TEST_KEY = 'entity' @@ -81,6 +97,15 @@ def test_valid_values_on_postings(hook, src_value): ]) assert not any(hook.run(txn)) +@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES) +def test_anonymous_values_on_postings(hook, src_value): + txn = testutil.Transaction(postings=[ + ('Assets:Cash', -25), + ('Expenses:General', 25, {TEST_KEY: src_value}), + ]) + assert not any(hook.run(txn)) + assert txn.postings[-1].meta[TEST_KEY] == 'Anonymous' + @pytest.mark.parametrize('src_value', INVALID_VALUES) def test_invalid_values_on_postings(hook, src_value): txn = testutil.Transaction(postings=[ @@ -99,6 +124,15 @@ def test_valid_values_on_transactions(hook, src_value): ]) assert not any(hook.run(txn)) +@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES) +def test_anonymous_values_on_transactions(hook, src_value): + txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[ + ('Assets:Cash', -25), + ('Expenses:General', 25), + ]) + assert not any(hook.run(txn)) + assert txn.meta[TEST_KEY] == 'Anonymous' + @pytest.mark.parametrize('src_value', INVALID_VALUES) def test_invalid_values_on_transactions(hook, src_value): txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[ @@ -118,6 +152,15 @@ def test_valid_values_on_payee(hook, src_value): ]) assert not any(hook.run(txn)) +@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES) +def test_anonymous_values_on_payee(hook, src_value): + txn = testutil.Transaction(payee=src_value, postings=[ + ('Assets:Cash', -25), + ('Expenses:General', 25), + ]) + assert not any(hook.run(txn)) + assert txn.meta[TEST_KEY] == 'Anonymous' + @pytest.mark.parametrize('src_value', INVALID_VALUES) def test_invalid_values_on_payee(hook, src_value): txn = testutil.Transaction(payee=src_value, postings=[