meta_entity: More battle testing.
See the test cases for examples of real entities in the books that we should accept for now.
This commit is contained in:
parent
072937eff5
commit
56b644f1db
2 changed files with 81 additions and 28 deletions
|
@ -25,39 +25,51 @@ from . import core
|
||||||
from .. import data
|
from .. import data
|
||||||
from .. import errors as errormod
|
from .. import errors as errormod
|
||||||
from ..beancount_types import (
|
from ..beancount_types import (
|
||||||
|
MetaKey,
|
||||||
|
MetaValue,
|
||||||
Transaction,
|
Transaction,
|
||||||
)
|
)
|
||||||
|
|
||||||
from typing import (
|
from typing import (
|
||||||
|
MutableMapping,
|
||||||
|
Optional,
|
||||||
Pattern,
|
Pattern,
|
||||||
|
Tuple,
|
||||||
)
|
)
|
||||||
|
|
||||||
class MetaEntity(core.TransactionHook):
|
class MetaEntity(core.TransactionHook):
|
||||||
METADATA_KEY = 'entity'
|
METADATA_KEY = 'entity'
|
||||||
HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
|
HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
|
||||||
|
|
||||||
# alnum is the set of characters we always accept in entity metadata:
|
# chars is the set of characters we always accept in entity metadata:
|
||||||
# letters and digits, minus the Latin 1 supplement (i.e., Roman letters
|
# letters, digits, and ASCII punctuation other than `-`, minus the Latin 1 supplement
|
||||||
# with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
|
# (i.e., Roman letters with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
|
||||||
# See the tests for specific cases.
|
# See the tests for specific cases.
|
||||||
alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
|
chars = r'\u0021-\u002c\u002e-\u007e\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
|
||||||
# A regexp that would be reasonably stricter would be:
|
ENTITY_RE: Pattern[str] = regex.compile(f'^[{chars}][-{chars}]*$', regex.VERSION1)
|
||||||
# f'^[{alnum}][.{alnum}]*(?:-[.{alnum}])*$'
|
ANONYMOUS_RE: Pattern[str] = regex.compile(r'^[-_.?!\s]*$', regex.VERSION1)
|
||||||
# However, current producers fail that regexp in a few different ways.
|
del chars
|
||||||
# See the tests for specific cases.
|
|
||||||
ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1)
|
def _check_entity(self,
|
||||||
del alnum
|
meta: MutableMapping[MetaKey, MetaValue],
|
||||||
|
default: Optional[str]=None,
|
||||||
|
) -> Tuple[Optional[str], Optional[bool]]:
|
||||||
|
entity = meta.get(self.METADATA_KEY, default)
|
||||||
|
if entity is None:
|
||||||
|
return None, None
|
||||||
|
elif not isinstance(entity, str):
|
||||||
|
return None, False
|
||||||
|
elif self.ANONYMOUS_RE.match(entity):
|
||||||
|
entity = 'Anonymous'
|
||||||
|
meta[self.METADATA_KEY] = entity
|
||||||
|
return entity, True
|
||||||
|
else:
|
||||||
|
return entity, self.ENTITY_RE.match(entity) is not None
|
||||||
|
|
||||||
def run(self, txn: Transaction) -> errormod.Iter:
|
def run(self, txn: Transaction) -> errormod.Iter:
|
||||||
if data.is_opening_balance_txn(txn):
|
if data.is_opening_balance_txn(txn):
|
||||||
return
|
return
|
||||||
txn_entity = txn.meta.get(self.METADATA_KEY, txn.payee)
|
txn_entity, txn_entity_ok = self._check_entity(txn.meta, txn.payee)
|
||||||
if txn_entity is None:
|
|
||||||
txn_entity_ok = None
|
|
||||||
elif isinstance(txn_entity, str):
|
|
||||||
txn_entity_ok = bool(self.ENTITY_RE.match(txn_entity))
|
|
||||||
else:
|
|
||||||
txn_entity_ok = False
|
|
||||||
if txn_entity_ok is False:
|
if txn_entity_ok is False:
|
||||||
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, txn_entity)
|
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, txn_entity)
|
||||||
for post in data.Posting.from_txn(txn):
|
for post in data.Posting.from_txn(txn):
|
||||||
|
@ -68,10 +80,8 @@ class MetaEntity(core.TransactionHook):
|
||||||
'Liabilities:Payable',
|
'Liabilities:Payable',
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
entity = post.meta.get(self.METADATA_KEY)
|
entity, entity_ok = self._check_entity(post.meta, txn_entity)
|
||||||
if entity is None:
|
if entity is txn_entity and entity is not None:
|
||||||
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)
|
|
||||||
elif entity is txn_entity:
|
|
||||||
pass
|
pass
|
||||||
elif not self.ENTITY_RE.match(entity):
|
elif not entity_ok:
|
||||||
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)
|
yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)
|
||||||
|
|
|
@ -37,8 +37,14 @@ VALID_VALUES = {
|
||||||
'スミスダコタ',
|
'スミスダコタ',
|
||||||
'スミス-ダコタ',
|
'スミス-ダコタ',
|
||||||
'Яшин-Данила',
|
'Яшин-Данила',
|
||||||
# The PayPal importer produces . in entity metadata
|
# Governments, using : as a hierarchy separator
|
||||||
|
'BE',
|
||||||
|
'US:KY',
|
||||||
|
'CA:ON',
|
||||||
|
# The PayPal importer allows ASCII punctuation in entity metadata
|
||||||
'Du-Bois-W.-E.-B.',
|
'Du-Bois-W.-E.-B.',
|
||||||
|
"O'Malley-Thomas",
|
||||||
|
'O`Malley-Thomas',
|
||||||
# import2ledger produces entities that end with -
|
# import2ledger produces entities that end with -
|
||||||
# That's probably a bug, but allow it for now.
|
# That's probably a bug, but allow it for now.
|
||||||
'foo-',
|
'foo-',
|
||||||
|
@ -47,7 +53,6 @@ VALID_VALUES = {
|
||||||
INVALID_VALUES = {
|
INVALID_VALUES = {
|
||||||
# Starting with a - is not allowed
|
# Starting with a - is not allowed
|
||||||
'-foo',
|
'-foo',
|
||||||
'-',
|
|
||||||
# Names that can be reduced to ASCII should be reduced to ASCII.
|
# Names that can be reduced to ASCII should be reduced to ASCII.
|
||||||
# Producers should change this to Uberentity or Ueberentity
|
# Producers should change this to Uberentity or Ueberentity
|
||||||
# I am not wild about this rule and would like to relax it—it's mostly
|
# I am not wild about this rule and would like to relax it—it's mostly
|
||||||
|
@ -56,14 +61,25 @@ INVALID_VALUES = {
|
||||||
# mangling producers are expected to do. But it's the rule for today.
|
# mangling producers are expected to do. But it's the rule for today.
|
||||||
'Überentity',
|
'Überentity',
|
||||||
# Whitespace is never allowed
|
# Whitespace is never allowed
|
||||||
' ',
|
|
||||||
'Alex Smith',
|
'Alex Smith',
|
||||||
'田中\u00A0流星', # Non-breaking space
|
'田中\u00A0流星', # Non-breaking space
|
||||||
# The only punctuation allowed is - and .
|
# Non-ASCII punctuation is not allowed
|
||||||
'スミス_ダコタ',
|
|
||||||
'Яшин—Данила', # em dash
|
'Яшин—Данила', # em dash
|
||||||
# An empty string is not valid
|
'O’Malley-Thomas', # Typographic apostrophe (U+2019, right single quotation mark)
|
||||||
|
'Du-Bois-W。-E。-B。', # Ideographic full stop (U+3002), the CJK period
|
||||||
|
}
|
||||||
|
|
||||||
|
ANONYMOUS_VALUES = {
|
||||||
|
# Values produced by various importers that should be translated to
|
||||||
|
# Anonymous.
|
||||||
'',
|
'',
|
||||||
|
' ',
|
||||||
|
'-',
|
||||||
|
'--',
|
||||||
|
'-----',
|
||||||
|
'_',
|
||||||
|
' _ ',
|
||||||
|
'.',
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_KEY = 'entity'
|
TEST_KEY = 'entity'
|
||||||
|
@ -81,6 +97,15 @@ def test_valid_values_on_postings(hook, src_value):
|
||||||
])
|
])
|
||||||
assert not any(hook.run(txn))
|
assert not any(hook.run(txn))
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
|
||||||
|
def test_anonymous_values_on_postings(hook, src_value):
|
||||||
|
txn = testutil.Transaction(postings=[
|
||||||
|
('Assets:Cash', -25),
|
||||||
|
('Expenses:General', 25, {TEST_KEY: src_value}),
|
||||||
|
])
|
||||||
|
assert not any(hook.run(txn))
|
||||||
|
assert txn.postings[-1].meta[TEST_KEY] == 'Anonymous'
|
||||||
|
|
||||||
@pytest.mark.parametrize('src_value', INVALID_VALUES)
|
@pytest.mark.parametrize('src_value', INVALID_VALUES)
|
||||||
def test_invalid_values_on_postings(hook, src_value):
|
def test_invalid_values_on_postings(hook, src_value):
|
||||||
txn = testutil.Transaction(postings=[
|
txn = testutil.Transaction(postings=[
|
||||||
|
@ -99,6 +124,15 @@ def test_valid_values_on_transactions(hook, src_value):
|
||||||
])
|
])
|
||||||
assert not any(hook.run(txn))
|
assert not any(hook.run(txn))
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
|
||||||
|
def test_anonymous_values_on_transactions(hook, src_value):
|
||||||
|
txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[
|
||||||
|
('Assets:Cash', -25),
|
||||||
|
('Expenses:General', 25),
|
||||||
|
])
|
||||||
|
assert not any(hook.run(txn))
|
||||||
|
assert txn.meta[TEST_KEY] == 'Anonymous'
|
||||||
|
|
||||||
@pytest.mark.parametrize('src_value', INVALID_VALUES)
|
@pytest.mark.parametrize('src_value', INVALID_VALUES)
|
||||||
def test_invalid_values_on_transactions(hook, src_value):
|
def test_invalid_values_on_transactions(hook, src_value):
|
||||||
txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[
|
txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[
|
||||||
|
@ -118,6 +152,15 @@ def test_valid_values_on_payee(hook, src_value):
|
||||||
])
|
])
|
||||||
assert not any(hook.run(txn))
|
assert not any(hook.run(txn))
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
|
||||||
|
def test_anonymous_values_on_payee(hook, src_value):
|
||||||
|
txn = testutil.Transaction(payee=src_value, postings=[
|
||||||
|
('Assets:Cash', -25),
|
||||||
|
('Expenses:General', 25),
|
||||||
|
])
|
||||||
|
assert not any(hook.run(txn))
|
||||||
|
assert txn.meta[TEST_KEY] == 'Anonymous'
|
||||||
|
|
||||||
@pytest.mark.parametrize('src_value', INVALID_VALUES)
|
@pytest.mark.parametrize('src_value', INVALID_VALUES)
|
||||||
def test_invalid_values_on_payee(hook, src_value):
|
def test_invalid_values_on_payee(hook, src_value):
|
||||||
txn = testutil.Transaction(payee=src_value, postings=[
|
txn = testutil.Transaction(payee=src_value, postings=[
|
||||||
|
|
Loading…
Reference in a new issue