From 499f18ff623c954558d59648653a8e0708baa460 Mon Sep 17 00:00:00 2001 From: Brett Smith Date: Wed, 1 Apr 2020 13:38:37 -0400 Subject: [PATCH] meta_entity: Adjust what entities are allowed based on today's books. See the comments throughout for more discussion about what cases are or aren't allowed, and why. --- conservancy_beancount/plugin/meta_entity.py | 24 +++++++++++++++-- setup.py | 1 + tests/test_meta_entity.py | 30 ++++++++++++++++++++- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/conservancy_beancount/plugin/meta_entity.py b/conservancy_beancount/plugin/meta_entity.py index 4096a65..ee655f4 100644 --- a/conservancy_beancount/plugin/meta_entity.py +++ b/conservancy_beancount/plugin/meta_entity.py @@ -14,7 +14,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import re +# Type stubs aren't available for regex. +# Fortunately, we're using it in a way that's API-compatible with the re +# module. We mitigate the lack of type stubs by providing type declarations +# for returned objects. This way, the only things that aren't type checked are +# the calls to regex functions. +import regex # type:ignore[import] from . import core from .. import data @@ -23,10 +28,25 @@ from ..beancount_types import ( Transaction, ) +from typing import ( + Pattern, +) + class MetaEntity(core.TransactionHook): METADATA_KEY = 'entity' HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY]) - ENTITY_RE = re.compile(r'^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*$') + + # alnum is the set of characters we always accept in entity metadata: + # letters and digits, minus the Latin 1 supplement (i.e., Roman letters + # with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.) + # See the tests for specific cases. 
+ alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}' + # A reasonably stricter regexp would be: + # f'^[{alnum}][.{alnum}]*(?:-[.{alnum}]+)*$' + # However, current producers fail that regexp in a few different ways. + # See the tests for specific cases. + ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1) + del alnum def run(self, txn: Transaction) -> errormod.Iter: txn_entity = txn.meta.get(self.METADATA_KEY) diff --git a/setup.py b/setup.py index c473325..565125c 100755 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ setup( install_requires=[ 'beancount>=2.2', + 'regex', 'rt>=2.0', ], setup_requires=[ diff --git a/tests/test_meta_entity.py b/tests/test_meta_entity.py index 0b75595..f20e0ab 100644 --- a/tests/test_meta_entity.py +++ b/tests/test_meta_entity.py @@ -21,19 +21,47 @@ from . import testutil from conservancy_beancount.plugin import meta_entity VALID_VALUES = { + # Classic entity: LastName-FirstName 'Smith-Alex', + # Various people and companies have one-word names + # Digits are allowed, as part of a name or standalone 'Company19', + 'Company-19', + # No case requirements 'boyd-danah', + # No limit on the number of parts of the name 'B-van-der-A', + # Names that have no ASCII are allowed, with or without dash separators + '田中流星', + '田中-流星', + 'スミスダコタ', + 'スミス-ダコタ', + 'Яшин-Данила', + # The PayPal importer produces . in entity metadata + 'Du-Bois-W.-E.-B.', + # import2ledger produces entities that end with - + # That's probably a bug, but allow it for now. + 'foo-', } INVALID_VALUES = { + # Starting with a - is not allowed '-foo', - 'foo-', '-', + # Names that can be reduced to ASCII should be + # Producers should change this to Uberentity or Ueberentity + # I am not wild about this rule and would like to relax it—it's mostly + # based on an expectation that entities are typed in by a human. 
That's true + # less and less and it seems like we should reduce the amount of mangling + # producers are expected to do. But it's the rule for today. 'Überentity', + # Whitespace is never allowed 'Alex Smith', + '田中 流星', + 'スミス ダコタ', + 'Яшин Данила', ' ', + # An empty string is not valid '', }