From a3e60c639f1de769973814d01a53aaea36771cf8 Mon Sep 17 00:00:00 2001
From: Ben Sturmfels
Date: Mon, 21 Feb 2022 12:16:24 +1100
Subject: [PATCH] reconcile: Add special case for payee first word match.

---
Review notes (ignored by git-am):
 - first_word_exact_match() no longer raises IndexError on whitespace-only
   payees and always returns a float.
 - Dropped the accidental second copy of test_remove_duplicate_words().
 - Hunk offsets and the diffstat were recomputed for the revised additions;
   the index hashes below are those of the original submission.

 .../reconcile/prototype_amex_reconciler.py    | 32 ++++++++++++++++++++++++++++--
 tests/test_reconcile.py                       |  5 +++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/conservancy_beancount/reconcile/prototype_amex_reconciler.py b/conservancy_beancount/reconcile/prototype_amex_reconciler.py
index 149c785..2d82576 100644
--- a/conservancy_beancount/reconcile/prototype_amex_reconciler.py
+++ b/conservancy_beancount/reconcile/prototype_amex_reconciler.py
@@ -89,6 +89,7 @@ JUNK_WORDS = [
     'online',
     'donation',
     'usd',
+    'inc',
 ]
 JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
 ZERO_RE = re.compile('^0+')
@@ -182,6 +183,32 @@ def sort_records(records: List) -> List:
     return sorted(records, key=lambda x: (x['date'], x['amount']))
 
 
+def first_word_exact_match(a, b):
+    """Score two payees on whether their first words match exactly.
+
+    The comparison is case-insensitive. A match scores 0.2 per character
+    of the first word, capped at 1.0, so short first words count for less.
+    Returns 0.0 when either payee has no words or the first words differ.
+    """
+    words_a = a.split()
+    words_b = b.split()
+    # split() yields nothing for empty or whitespace-only strings.
+    if not words_a or not words_b:
+        return 0.0
+    first_a = words_a[0]
+    first_b = words_b[0]
+    if first_a.casefold() == first_b.casefold():
+        return min(1.0, 0.2 * len(first_a))
+    return 0.0
+
+
+def payee_match(a, b):
+    """Return the better of the fuzzy and first-word payee scores."""
+    fuzzy_match = fuzz.token_set_ratio(a, b) / 100.00
+    first_word_match = first_word_exact_match(a, b)
+    return max(fuzzy_match, first_word_match)
+
+
 def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
     """Do these records represent the same transaction?"""
 
@@ -204,7 +231,8 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
     else:
         check_score = 0.0
 
-    payee_score = fuzz.token_set_ratio(r1['payee'], r2['payee']) / 100.00
+    payee_score = payee_match(r1['payee'], r2['payee'])
+
     if check_score == 1.0 or payee_score > 0.8:
         payee_message = ''
     elif payee_score > 0.4:
@@ -387,7 +415,7 @@ def main(args):
 
     out = io.StringIO()
     print('-' * 155)
-    print(f'{"Statement transaction":<38} {"Books transaction":<44} Notes')
+    print(f'{"Statement transaction":<52} {"Books transaction":<58} Notes')
     print('-' * 155)
     for _, output in sorted(matches):
         print(output)
diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py
index ea9fe8e..7c0ba5d 100644
--- a/tests/test_reconcile.py
+++ b/tests/test_reconcile.py
@@ -6,6 +6,7 @@ from conservancy_beancount.reconcile.prototype_amex_reconciler import (
     remove_payee_junk,
     date_proximity,
     remove_duplicate_words,
+    payee_match,
 )
 
 S1 = {
@@ -178,3 +179,7 @@ def test_date_proximity():
 
 def test_remove_duplicate_words():
     assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow'
+
+def test_payee_matches_when_first_word_matches():
+    assert payee_match('Gandi San Francisco', 'Gandi example.com renewal 1234567') == 1.0
+    assert payee_match('USPS 123456789 Portland', 'USPS John Brown') == 0.8