meta_entity: Adjust what entities are allowed based on today's books.
See the comments throughout for more discussion about what cases are or aren't allowed, and why.
This commit is contained in:
		
							parent
							
								
									90a58ef112
								
							
						
					
					
						commit
						499f18ff62
					
				
					 3 changed files with 52 additions and 3 deletions
				
			
		| 
						 | 
					@ -14,7 +14,12 @@
 | 
				
			||||||
# You should have received a copy of the GNU Affero General Public License
 | 
					# You should have received a copy of the GNU Affero General Public License
 | 
				
			||||||
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
 | 
					# along with this program.  If not, see <https://www.gnu.org/licenses/>.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import re
 | 
					# Type stubs aren't available for regex.
 | 
				
			||||||
 | 
					# Fortunately, we're using it in a way that's API-compatible with the re
 | 
				
			||||||
 | 
					# module. We mitigate the lack of type stubs by providing type declarations
 | 
				
			||||||
 | 
					# for returned objects. This way, the only things that aren't type checked are
 | 
				
			||||||
 | 
					# the calls to regex functions.
 | 
				
			||||||
 | 
					import regex  # type:ignore[import]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from . import core
 | 
					from . import core
 | 
				
			||||||
from .. import data
 | 
					from .. import data
 | 
				
			||||||
| 
						 | 
					@ -23,10 +28,25 @@ from ..beancount_types import (
 | 
				
			||||||
    Transaction,
 | 
					    Transaction,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from typing import (
 | 
				
			||||||
 | 
					    Pattern,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class MetaEntity(core.TransactionHook):
 | 
					class MetaEntity(core.TransactionHook):
 | 
				
			||||||
    METADATA_KEY = 'entity'
 | 
					    METADATA_KEY = 'entity'
 | 
				
			||||||
    HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
 | 
					    HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
 | 
				
			||||||
    ENTITY_RE = re.compile(r'^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*$')
 | 
					
 | 
				
			||||||
 | 
					    # alnum is the set of characters we always accept in entity metadata:
 | 
				
			||||||
 | 
					    # letters and digits, minus the Latin 1 supplement (i.e., Roman letters
 | 
				
			||||||
 | 
					    # with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
 | 
				
			||||||
 | 
					    # See the tests for specific cases.
 | 
				
			||||||
 | 
					    alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
 | 
				
			||||||
 | 
					    # A regexp that would be reasonably stricter would be:
 | 
				
			||||||
 | 
					    #   f'^[{alnum}][.{alnum}]*(?:-[.{alnum}]+)*$'
 | 
				
			||||||
 | 
					    # However, current producers fail that regexp in a few different ways.
 | 
				
			||||||
 | 
					    # See the tests for specific cases.
 | 
				
			||||||
 | 
					    ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1)
 | 
				
			||||||
 | 
					    del alnum
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def run(self, txn: Transaction) -> errormod.Iter:
 | 
					    def run(self, txn: Transaction) -> errormod.Iter:
 | 
				
			||||||
        txn_entity = txn.meta.get(self.METADATA_KEY)
 | 
					        txn_entity = txn.meta.get(self.METADATA_KEY)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										1
									
								
								setup.py
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
										
									
									
									
								
							| 
						 | 
					@ -12,6 +12,7 @@ setup(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    install_requires=[
 | 
					    install_requires=[
 | 
				
			||||||
        'beancount>=2.2',
 | 
					        'beancount>=2.2',
 | 
				
			||||||
 | 
					        'regex',
 | 
				
			||||||
        'rt>=2.0',
 | 
					        'rt>=2.0',
 | 
				
			||||||
    ],
 | 
					    ],
 | 
				
			||||||
    setup_requires=[
 | 
					    setup_requires=[
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -21,19 +21,47 @@ from . import testutil
 | 
				
			||||||
from conservancy_beancount.plugin import meta_entity
 | 
					from conservancy_beancount.plugin import meta_entity
 | 
				
			||||||
 | 
					
 | 
				
			||||||
VALID_VALUES = {
 | 
					VALID_VALUES = {
 | 
				
			||||||
 | 
					    # Classic entity: LastName-FirstName
 | 
				
			||||||
    'Smith-Alex',
 | 
					    'Smith-Alex',
 | 
				
			||||||
 | 
					    # Various people and companies have one-word names
 | 
				
			||||||
 | 
					    # Digits are allowed, as part of a name or standalone
 | 
				
			||||||
    'Company19',
 | 
					    'Company19',
 | 
				
			||||||
 | 
					    'Company-19',
 | 
				
			||||||
 | 
					    # No case requirements
 | 
				
			||||||
    'boyd-danah',
 | 
					    'boyd-danah',
 | 
				
			||||||
 | 
					    # No limit on the number of parts of the name
 | 
				
			||||||
    'B-van-der-A',
 | 
					    'B-van-der-A',
 | 
				
			||||||
 | 
					    # Names that have no ASCII are allowed, with or without dash separators
 | 
				
			||||||
 | 
					    '田中流星',
 | 
				
			||||||
 | 
					    '田中-流星',
 | 
				
			||||||
 | 
					    'スミスダコタ',
 | 
				
			||||||
 | 
					    'スミス-ダコタ',
 | 
				
			||||||
 | 
					    'Яшин-Данила',
 | 
				
			||||||
 | 
					    # The PayPal importer produces . in entity metadata
 | 
				
			||||||
 | 
					    'Du-Bois-W.-E.-B.',
 | 
				
			||||||
 | 
					    # import2ledger produces entities that end with -
 | 
				
			||||||
 | 
					    # That's probably a bug, but allow it for now.
 | 
				
			||||||
 | 
					    'foo-',
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
INVALID_VALUES = {
 | 
					INVALID_VALUES = {
 | 
				
			||||||
 | 
					    # Starting with a - is not allowed
 | 
				
			||||||
    '-foo',
 | 
					    '-foo',
 | 
				
			||||||
    'foo-',
 | 
					 | 
				
			||||||
    '-',
 | 
					    '-',
 | 
				
			||||||
 | 
					    # Names that can be reduced to ASCII should be reduced.
 | 
				
			||||||
 | 
					    # Producers should change this to Uberentity or Ueberentity
 | 
				
			||||||
 | 
					    # I am not wild about this rule and would like to relax it—it's mostly
 | 
				
			||||||
 | 
					    # based on an expectation that entities are typed in by a human. That's true
 | 
				
			||||||
 | 
					    # less and less and it seems like we should reduce the amount of mangling
 | 
				
			||||||
 | 
					    # producers are expected to do. But it's the rule for today.
 | 
				
			||||||
    'Überentity',
 | 
					    'Überentity',
 | 
				
			||||||
 | 
					    # Whitespace is never allowed
 | 
				
			||||||
    'Alex Smith',
 | 
					    'Alex Smith',
 | 
				
			||||||
 | 
					    '田中 流星',
 | 
				
			||||||
 | 
					    'スミス ダコタ',
 | 
				
			||||||
 | 
					    'Яшин Данила',
 | 
				
			||||||
    ' ',
 | 
					    ' ',
 | 
				
			||||||
 | 
					    # An empty string is not valid
 | 
				
			||||||
    '',
 | 
					    '',
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue