From 405dd553cb0006f3dd9f54f510df0bc0a98a7b00 Mon Sep 17 00:00:00 2001 From: Ben Sturmfels Date: Wed, 23 Feb 2022 17:24:31 +1100 Subject: [PATCH] reconcile: Implement "subset sum" feature matching multiple books entries to a single statement entry. --- .../reconcile/statement_reconciler.py | 111 ++++++++++++---- tests/test_reconcile.py | 125 +++++++++++++----- 2 files changed, 171 insertions(+), 65 deletions(-) diff --git a/conservancy_beancount/reconcile/statement_reconciler.py b/conservancy_beancount/reconcile/statement_reconciler.py index 0cca0d9..897c369 100644 --- a/conservancy_beancount/reconcile/statement_reconciler.py +++ b/conservancy_beancount/reconcile/statement_reconciler.py @@ -49,6 +49,7 @@ import csv import datetime import decimal import io +import itertools import logging import os import re @@ -57,6 +58,7 @@ from typing import Callable, Dict, List, Tuple, TextIO from beancount import loader from beancount.query.query import run_query +from colorama import Fore, Style if not sys.warnoptions: import warnings @@ -164,21 +166,26 @@ def standardize_beancount_record(row) -> Dict: # type: ignore[no-untyped-def] } -def format_record(records: list[dict]) -> str: - if len(records) == 1: - record = records[0] - - if record['payee'] and record['check_id']: - output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59) - elif record['payee']: - output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59) - else: - output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59) - return output +def format_record(record: dict) -> str: + if record['payee'] and record['check_id']: + output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59) + elif record['payee']: + output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59) else: - raise NotImplementedError + output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59) + return output +def format_multirecord(r1s, r2s, note): + total = sum(x['amount'] for x in r2s) + assert len(r1s) == 1 + assert len(r2s) > 1 + match_output = [] + match_output.append([r1s[0]['date'].isoformat() + ' ' + r1s[0]['payee'], f'{format_record(r1s[0])} → {format_record(r2s[0])} ✓ Matched{note}']) + for i, r2 in enumerate(r2s[1:]): + match_output.append([r1s[0]['date'].isoformat() + str(i) + r1s[0]['payee'], f'{r1s[0]["date"].isoformat()}: ↳ → {format_record(r2)} ✓ Matched{note}']) + return match_output + def sort_records(records: List) -> List: return sorted(records, key=lambda x: (x['date'], x['amount'])) @@ -253,13 +260,7 @@ def match_statement_and_books(statement_trans: list, books_trans: list): """ matches = [] - # We need a realised list and should be a copy so we can safely delete - # items. - books_trans = list(books_trans) - - # We can delete the matched books trans, but seems not a good idea to delete - # while iterating through statement_trans. Instead pushing onto a separate - # list. + remaining_books_trans = [] remaining_statement_trans = [] for r1 in statement_trans: @@ -278,12 +279,13 @@ def match_statement_and_books(statement_trans: list, books_trans: list): if best_match_score <= 0.8: best_match_note.append('only one decent match') matches.append(([r1], [books_trans[best_match_index]], best_match_note)) + # Don't try to make a second match against this books entry. del books_trans[best_match_index] else: - matches.append(([r1], [], ['no match'])) + remaining_statement_trans.append(r1) for r2 in books_trans: - matches.append(([], [r2], ['no match'])) - return matches + remaining_books_trans.append(r2) + return matches, remaining_statement_trans, remaining_books_trans def format_matches(matches, csv_statement: str, show_reconciled_matches): @@ -292,12 +294,15 @@ def format_matches(matches, csv_statement: str, show_reconciled_matches): note = ', '.join(note) note = ': ' + note if note else note if r1 and r2: - if show_reconciled_matches: - match_output.append([r1[0]['date'], f'{format_record(r1)} → {format_record(r2)} ✓ Matched{note}']) + if show_reconciled_matches and all(x['bank_statement'] for x in r2): + if len(r2) == 1: + match_output.append([r1[0]['date'].isoformat() + r1[0]['payee'], f'{format_record(r1[0])} → {format_record(r2[0])} ✓ Matched{note}']) + else: + match_output.extend(format_multirecord(r1, r2, note)) elif r1: - match_output.append([r1[0]['date'], f'{format_record(r1)} → {" ":^59} ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})']) + match_output.append([r1[0]['date'].isoformat() + r1[0]['payee'], Fore.RED + Style.BRIGHT + f'{format_record(r1[0])} → {" ":^59} ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})' + Style.RESET_ALL]) else: - match_output.append([r2[0]['date'], f'{" ":^59} → {format_record(r2)} ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})']) + match_output.append([r2[0]['date'].isoformat() + r2[0]['payee'], Fore.RED + Style.BRIGHT + f'{" ":^59} → {format_record(r2[0])} ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})' + Style.RESET_ALL]) return match_output @@ -400,6 +405,51 @@ def totals(matches): total_missing_from_statement += sum(c['amount'] for c in books_entries) return total_matched, total_missing_from_books, total_missing_from_statement + +def subset_match(statement_trans, books_trans): + matches = [] + remaining_books_trans = [] + remaining_statement_trans = [] + + groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee'])) + for k, group in groups: + best_match_score = 0 + best_match_index = None + best_match_note = '' + matches_found = 0 + + group_items = list(group) + total = sum(x['amount'] for x in group_items) + r2 = copy.copy(group_items[0]) + r2['amount'] = total + for i, r1 in enumerate(statement_trans): + score, note = records_match(r1, r2) + if score >= 0.5 and score >= best_match_score: + matches_found += 1 + best_match_score = score + best_match_index = i + best_match_note = note + if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8: + if best_match_score <= 0.8: + best_match_note.append('only one decent match') + matches.append(([statement_trans[best_match_index]], group_items, best_match_note)) + del statement_trans[best_match_index] + for item in group_items: + books_trans.remove(item) + else: + remaining_books_trans.append(r2) + for r1 in statement_trans: + remaining_statement_trans.append(r1) + return matches, remaining_statement_trans, remaining_books_trans + +def process_unmatched(statement_trans, books_trans): + matches = [] + for r1 in statement_trans: + matches.append(([r1], [], ['no match'])) + for r2 in books_trans: + matches.append(([], [r2], ['no match'])) + return matches + def main(args): # TODO: Should put in a sanity check to make sure the statement you're feeding # in matches the account you've provided. @@ -443,10 +493,13 @@ def main(args): books_trans = sort_records([standardize_beancount_record(row) for row in result_rows]) - matches = match_statement_and_books(statement_trans, books_trans) - match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches) + matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans) + subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(remaining_statement_trans, remaining_books_trans) + matches.extend(subset_matches) + unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans) + matches.extend(unmatched) - # assert books_balance == books_balance_reconciled + total_matched + total_missing_from_statement + match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches) total_matched, total_missing_from_books, total_missing_from_statement = totals(matches) diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index 7703921..fc00ef5 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -13,6 +13,7 @@ from conservancy_beancount.reconcile.statement_reconciler import ( metadata_for_match, write_metadata_to_books, totals, + subset_match, ) # These data structures represent individual transactions as taken from the @@ -40,6 +41,13 @@ S3 = { 'check_id': '', 'line': 444, } +S4 = { + 'date': datetime.date(2022, 8, 11), + 'amount': decimal.Decimal('-2260.00'), + 'payee': 'Trust 0000000362 210', + 'check_id': '', + 'line': 555, +} # Books transaction examples. B1 = { @@ -114,81 +122,116 @@ B3_unmatched_check_id = { 'line': 999, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" } +B4A = { + 'date': datetime.date(2022, 8, 11), + 'amount': decimal.Decimal('-250.00'), + 'payee': 'TRUST 0000000362 ACH Retirement Plan', + 'check_id': '', + 'line': 1000, +} +B4B = { + 'date': datetime.date(2022, 8, 11), + 'amount': decimal.Decimal('-250.00'), + 'payee': 'TRUST 0000000362 ACH Retirement Plan', + 'check_id': '', + 'line': 1000, +} +B4C = { + 'date': datetime.date(2022, 8, 11), + 'amount': decimal.Decimal('-1760.00'), + 'payee': 'TRUST 0000000362 ACH Retirement Plan', + 'check_id': '', + 'line': 1000, +} + def test_one_exact_match(): statement = [S1] books = [B1] - assert match_statement_and_books(statement, books) == [ + assert match_statement_and_books(statement, books) == ( # Match, match, notes. # # The matches are a list so we can implement subset-sum matching where # multiple books transactions may match to a single statement # transaction. - ([S1], [B1], []), - ] + [([S1], [B1], [])], + [], + [], + ) def test_multiple_exact_matches(): statement = [S1, S2] books = [B1, B2] - assert match_statement_and_books(statement, books) == [ - ([S1], [B1], []), - ([S2], [B2], []), - ] + assert match_statement_and_books(statement, books) == ( + [([S1], [B1], []), ([S2], [B2], [])], + [], + [], + ) def test_one_mismatch(): statement = [S1] books = [] - assert match_statement_and_books(statement, books) == [ - ([S1], [], ['no match']), - ] + assert match_statement_and_books(statement, books) == ( + [], + [S1], + [], + ) def test_multiple_mismatches(): statement = [S1] books = [B2] - assert match_statement_and_books(statement, books) == [ - ([S1], [], ['no match']), - ([], [B2], ['no match']), - ] + assert match_statement_and_books(statement, books) == ( + [], + [S1], + [B2], + ) def test_next_day_matches(): statement = [S3] books = [B3_next_day] - assert match_statement_and_books(statement, books) == [ - ([S3], [B3_next_day], ['+/- 1 days']), - ] + assert match_statement_and_books(statement, books) == ( + [([S3], [B3_next_day], ['+/- 1 days'])], + [], + [], + ) def test_next_week_matches(): statement = [S3] books = [B3_next_week] - assert match_statement_and_books(statement, books) == [ - ([S3], [B3_next_week], ['+/- 7 days']), - ] + assert match_statement_and_books(statement, books) == ( + [([S3], [B3_next_week], ['+/- 7 days'])], + [], + [], + ) def test_incorrect_amount_does_not_match(): statement = [S3] books = [B3_mismatch_amount] - assert match_statement_and_books(statement, books) == [ - ([S3], [], ['no match']), - ([], [B3_mismatch_amount], ['no match']), - ] + assert match_statement_and_books(statement, books) == ( + [], + [S3], + [B3_mismatch_amount], + ) def test_payee_mismatch_ok_when_only_one_that_amount_and_date(): statement = [S3] books = [B3_payee_mismatch_1] - assert match_statement_and_books(statement, books) == [ - ([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match']), - ] + assert match_statement_and_books(statement, books) == ( + [([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match'])], + [], + [], + ) def test_payee_mismatch_not_ok_when_multiple_that_amount_and_date(): statement = [S3] books = [B3_payee_mismatch_1, B3_payee_mismatch_2] match = match_statement_and_books(statement, books) - assert match == [ - ([S3], [], ['no match']), - ([], [B3_payee_mismatch_1], ['no match']), - ([], [B3_payee_mismatch_2], ['no match']), - ] + assert match == ( + [], + [S3], + [B3_payee_mismatch_1, B3_payee_mismatch_2], + ) def test_remove_payee_junk(): assert remove_payee_junk('WIDGETSRUS INC PAYMENT 1') == 'WIDGETSRUS' @@ -251,7 +294,17 @@ def test_payee_not_considered_if_check_id_present(): # These records match aside from check-id. statement = [S3] books = [B3_unmatched_check_id] - assert match_statement_and_books(statement, books) == [ - ([S3], [], ['no match']), - ([], [B3_unmatched_check_id], ['no match']), - ] + assert match_statement_and_books(statement, books) == ( + [], + [S3], + [B3_unmatched_check_id], + ) + +def test_subset_sum_match(): + statement = [S4] + books = [B4A, B4B, B4C] + assert subset_match(statement, books) == ( + [([S4], [B4A, B4B, B4C], [])], + [], # No remaining statement trans. + [], # No remaining books trans. + )