reconcile: Add special case for payee first word match.

This commit is contained in:
Ben Sturmfels 2022-02-21 12:16:24 +11:00
parent 32fc4517a0
commit a3e60c639f
Signed by: bsturmfels
GPG key ID: 023C05E2C9C068F0
2 changed files with 27 additions and 2 deletions

View file

@ -89,6 +89,7 @@ JUNK_WORDS = [
'online', 'online',
'donation', 'donation',
'usd', 'usd',
'inc',
] ]
JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS] JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
ZERO_RE = re.compile('^0+') ZERO_RE = re.compile('^0+')
@ -182,6 +183,21 @@ def sort_records(records: List) -> List:
return sorted(records, key=lambda x: (x['date'], x['amount'])) return sorted(records, key=lambda x: (x['date'], x['amount']))
def first_word_exact_match(a, b):
    """Score two payee strings on whether their first words are identical.

    The first whitespace-separated word of each string is compared
    case-insensitively (via ``str.casefold``).

    Args:
        a: First payee string.
        b: Second payee string.

    Returns:
        ``0.2`` per character of the matching first word, capped at ``1.0``
        (so a word of five or more characters scores ``1.0``); ``0`` when the
        first words differ or when either string contains no words at all.
    """
    words_a = a.split()
    words_b = b.split()
    # Check the token lists rather than the raw string lengths: a
    # whitespace-only string is non-empty but splits to [], and indexing
    # [0] on it would raise IndexError.
    if not words_a or not words_b:
        return 0
    first_a = words_a[0]
    first_b = words_b[0]
    if first_a.casefold() == first_b.casefold():
        # Longer matching words are more convincing; scale by length.
        return min(1.0, 0.2 * len(first_a))
    return 0
def payee_match(a, b):
    """Return the best available similarity score for two payee strings.

    Two scores are computed and the larger one wins:

    * ``fuzz.token_set_ratio(a, b)`` divided by 100.
    * ``first_word_exact_match(a, b)``, the special case that rewards
      payees sharing the same first word.
    """
    candidate_scores = (
        fuzz.token_set_ratio(a, b) / 100.00,
        first_word_exact_match(a, b),
    )
    return max(candidate_scores)
def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]: def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
"""Do these records represent the same transaction?""" """Do these records represent the same transaction?"""
@ -204,7 +220,8 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
else: else:
check_score = 0.0 check_score = 0.0
payee_score = fuzz.token_set_ratio(r1['payee'], r2['payee']) / 100.00 payee_score = payee_match(r1['payee'], r2['payee'])
if check_score == 1.0 or payee_score > 0.8: if check_score == 1.0 or payee_score > 0.8:
payee_message = '' payee_message = ''
elif payee_score > 0.4: elif payee_score > 0.4:
@ -387,7 +404,7 @@ def main(args):
out = io.StringIO() out = io.StringIO()
print('-' * 155) print('-' * 155)
print(f'{"Statement transaction":<38} {"Books transaction":<44} Notes') print(f'{"Statement transaction":<52} {"Books transaction":<58} Notes')
print('-' * 155) print('-' * 155)
for _, output in sorted(matches): for _, output in sorted(matches):
print(output) print(output)

View file

@ -6,6 +6,7 @@ from conservancy_beancount.reconcile.prototype_amex_reconciler import (
remove_payee_junk, remove_payee_junk,
date_proximity, date_proximity,
remove_duplicate_words, remove_duplicate_words,
payee_match,
) )
S1 = { S1 = {
@ -178,3 +179,10 @@ def test_date_proximity():
def test_remove_duplicate_words(): def test_remove_duplicate_words():
assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow' assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow'
def test_remove_duplicate_words():
    """'FOO' repeating the earlier 'Foo' is removed from the phrase."""
    # NOTE(review): a test with this same name appears immediately above in
    # this view; if both are real definitions, this one shadows the other.
    deduplicated = remove_duplicate_words('Hi Foo Kow FOO')
    assert deduplicated == 'Hi Foo Kow'
def test_payee_matches_when_first_word_matches():
    """Payees sharing a first word score 1.0 ('Gandi') and 0.8 ('USPS')."""
    gandi_score = payee_match('Gandi San Francisco', 'Gandi example.com renewal 1234567')
    assert gandi_score == 1.0

    usps_score = payee_match('USPS 123456789 Portland', 'USPS John Brown')
    assert usps_score == 0.8