From d8f4eac53bb7dc66c16d780156cbd785fa9333e9 Mon Sep 17 00:00:00 2001 From: Ben Sturmfels Date: Fri, 18 Feb 2022 20:36:11 +1100 Subject: [PATCH] reconcile: Improve reconciler matching and add test cases. --- .../reconcile/prototype_amex_reconciler.py | 220 +++++++++++++----- tests/test_reconcile.py | 175 ++++++++++++++ 2 files changed, 337 insertions(+), 58 deletions(-) create mode 100644 tests/test_reconcile.py diff --git a/conservancy_beancount/reconcile/prototype_amex_reconciler.py b/conservancy_beancount/reconcile/prototype_amex_reconciler.py index 5a9b012..487d068 100644 --- a/conservancy_beancount/reconcile/prototype_amex_reconciler.py +++ b/conservancy_beancount/reconcile/prototype_amex_reconciler.py @@ -44,11 +44,14 @@ TODO/ISSUES: """ import argparse import collections +import copy import csv import datetime import decimal import io +import logging import os +import re import sys from typing import Callable, Dict, List, Tuple, TextIO @@ -62,6 +65,49 @@ if not sys.warnoptions: warnings.filterwarnings('ignore', category=UserWarning, module='thefuzz.fuzz') from thefuzz import fuzz # type: ignore +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +# Console logging. +logger.addHandler(logging.StreamHandler()) + + +JUNK_WORDS = [ + 'software', + 'freedom', + 'conservancy', + 'conse', + 'payment', + 'echeck', + 'bill', + 'debit', + 'wire', + 'credit', + "int'l", + "in.l", + 'llc', + 'online', + 'donation', +] +JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS] +ZERO_RE = re.compile('^0+') + +def remove_payee_junk(payee): + for r in JUNK_WORDS_RES: + payee = r.sub('', payee) + payee = ZERO_RE.sub('', payee) + payee = payee.replace(' - ', ' ') + payee = re.sub(r'\.0\.\d+', ' ', payee) + payee = payee.replace('.0', ' ') + payee = payee.replace('/', ' ') + payee = re.sub(re.escape('.com'), ' ', payee, flags=re.IGNORECASE) + payee = re.sub(re.escape('.net'), ' ', payee, flags=re.IGNORECASE) + payee = payee.replace('*', ' ') + payee = ' '.join([i for i in payee.split(' ') if len(i) > 3]) + payee = payee.replace('-', ' ') + payee.strip() + return payee + # NOTE: Statement doesn't seem to give us a running balance or a final total. def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable) -> list: @@ -74,8 +120,10 @@ def standardize_amex_record(row: Dict, line: int) -> Dict: return { 'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(), 'amount': -1 * decimal.Decimal(row['Amount']), + # Descriptions have too much noise, so taking just the start + # significantly assists the fuzzy matching. + 'payee': remove_payee_junk(row['Description'] or '')[:25], 'line': line, - 'payee': row['Description'] or '', } @@ -83,39 +131,46 @@ def standardize_fr_record(row: Dict, line: int) -> Dict: return { 'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(), 'amount': decimal.Decimal(row['Amount']), - 'payee': row['Detail'] or row['Description'], + 'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20], 'line': line, } def standardize_beancount_record(row) -> Dict: # type: ignore[no-untyped-def] """Turn a Beancount query result row into a standard dict representing a transaction.""" + # if '0000000362' in row.narration: + # import pdb; pdb.set_trace() return { 'date': row.date, 'amount': row.number_cost_position, - 'payee': row.payee if row.payee else row.narration, + 'payee': remove_payee_junk(row.payee or row.narration)[:30], + 'entity': row.entity, + 'check_id': row.check_id, 'filename': row.filename, - 'line': row.posting_line, - 'statement': row.posting_statement, + 'line': row.line, + 'bank_statement': row.bank_statement, } -def format_record(record: Dict) -> str: - return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:20]:<20}" +def format_record(records: list[dict]) -> str: + if len(records) == 1: + record = records[0] + return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}" + else: + raise NotImplementedError def sort_records(records: List) -> List: return sorted(records, key=lambda x: (x['date'], x['amount'])) -def match_statement_and_books(statement_trans: list, books_trans: list, show_reconciled_matches: bool, csv_statement: str) -> tuple[list, list, decimal.Decimal]: +def match_statement_and_books2(statement_trans: list, books_trans: list): matches = [] - metadata_to_apply = [] - total_matched = decimal.Decimal(0) - total_missing_from_books = decimal.Decimal(0) - total_missing_from_statement = decimal.Decimal(0) + # We need a realised list and should be a copy so we can safely delete + # items. + books_trans = list(books_trans) - # Run through all the statement transactions to find a matching transaction in + # run through all the statement transactions to find a matching transaction in # the books. If found, the books transaction is marked off so that it can only # be matched once. Some transactions will be matched, some will be on the # statement but not the books and some on the books but not the statement. @@ -125,49 +180,96 @@ def match_statement_and_books(statement_trans: list, books_trans: list, show_rec # the books transactions to find an closely matching payee, then do another pass # disregarding payee. - # TODO: What if th for r1 in statement_trans: - for r2 in books_trans: - match, note = records_match(r1, r2) - if match: - if not r2['statement'] or show_reconciled_matches: - matches.append([r2['date'], f'{format_record(r1)} → {format_record(r2)} ✓ {note}']) - total_matched += r2['amount'] - if not r2['statement']: - metadata_to_apply.append((r2['filename'], r2['line'], f' bank-statement: "{os.path.basename(csv_statement)}:{r2["line"]}"\n')) - books_trans.remove(r2) - break + best_match_score = 0 + best_match_index = None + best_match_note = '' + matches_found = 0 + for i, r2 in enumerate(books_trans): + score, note = records_match2(r1, r2) + if score >= 0.5 and score >= best_match_score: + matches_found += 1 + best_match_score = score + best_match_index = i + best_match_note = note + # if note == 'payee mismatch': + # payee_only_mismatches += 1 + if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8: + if best_match_score <= 0.8: + best_match_note.append('only one decent match') + matches.append(([r1], [books_trans[best_match_index]], best_match_note)) + del books_trans[best_match_index] else: - matches.append([r1['date'], f'{format_record(r1)} → {" ":^45} ✗ Not in books ({os.path.basename(csv_statement)}:{r1["line"]})']) - total_missing_from_books += r1['amount'] + matches.append(([r1], [], ['no match'])) for r2 in books_trans: - matches.append([r2['date'], f'{" ":^45} → {format_record(r2)} ✗ Not on statement ({os.path.basename(r2["filename"])}:{r2["line"]})']) - total_missing_from_statement += r2['amount'] - return matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement + matches.append(([], [r2], ['no match'])) + return matches + +def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str): + match_output = [] + metadata_to_apply = [] + total_matched = decimal.Decimal(0) + total_missing_from_books = decimal.Decimal(0) + total_missing_from_statement = decimal.Decimal(0) + + matches = match_statement_and_books2(statement_trans, books_trans) + for r1, r2, note in matches: + note = ', '.join(note) + note = ': ' + note if note else note + if r1 and r2: + match_output.append([r1[0]['date'], f'{format_record(r1)} → {format_record(r2)} ✓ Matched{note}']) + elif r1: + match_output.append([r1[0]['date'], f'{format_record(r1)} → {" ":^50} ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})']) + else: + match_output.append([r2[0]['date'], f'{" ":^50} → {format_record(r2)} ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})']) + return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement -# TODO: Time for some test cases I think. # TODO: Could potentially return a score so that we can find the best match from # a pool of candidates. How would be then remove that candidate from the global # pool? -def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]: - """Do these records represent the same transaction?""" - date_matches_exactly = r1['date'] == r2['date'] - date_matches_loosly = r1['date'] >= r2['date'] - datetime.timedelta(days=3) and r1['date'] <= r2['date'] + datetime.timedelta(days=3) - amount_matches = r1['amount'] == r2['amount'] - payee_match_quality = fuzz.token_set_ratio(r1['payee'], r2['payee']) - payee_matches = payee_match_quality > 50 - if date_matches_exactly and amount_matches and payee_matches: - return True, 'Matched' - elif date_matches_loosly and amount_matches and payee_matches: - return True, 'Matched +/- 3 days' - elif date_matches_exactly and amount_matches: - return True, f'Matched ignoring payee' - elif date_matches_loosly and amount_matches: - return True, f'Matched +/- 3 days, ignoring payee' +def date_proximity(d1, d2): + diff = abs((d1 - d2).days) + if diff > 60: + return 0 else: - return False, '' + return 1.0 - (diff / 60.0) + +def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]: + """Do these records represent the same transaction?""" + + date_score = date_proximity(r1['date'], r2['date']) + if r1['date'] == r2['date']: + date_message = '' + elif date_score > 0.0: + diff = abs((r1['date'] - r2['date']).days) + date_message = f'+/- {diff} days' + else: + date_message = 'date mismatch' + + if r1['amount'] == r2['amount']: + amount_score, amount_message = 2.0, '' + else: + amount_score, amount_message = 0.0, 'amount mismatch' + + if r2['check_id']: + payee_score = 1.0 if r1['payee'] == str(r2['check_id']) else 0 + payee_message = 'check # matched' + else: + payee_match_quality_payee = fuzz.token_set_ratio(r1['payee'], r2['payee']) + payee_match_quality_entity = fuzz.token_set_ratio(r1['payee'], r2['entity']) + payee_score = max(payee_match_quality_payee, payee_match_quality_entity) / 100.0 + if payee_score > 0.8: + payee_message = '' + elif payee_score > 0.5: + payee_message = 'partial payee match' + else: + payee_message = 'payee mismatch' + + overall_score = (date_score + amount_score + payee_score) / 4 + overall_message = [m for m in [date_message, amount_message, payee_message] if m] + return overall_score, overall_message # TODO: Is there a way to pull the side-effecting code out of this function? @@ -221,6 +323,9 @@ def parse_args(argv): def main(args): # TODO: Should put in a sanity check to make sure the statement you're feeding # in matches the account you've provided. + + # TODO: Can we open the files first, then pass the streams on to the rest of the program? + if 'AMEX' in args.account: standardize_statement_record = standardize_amex_record else: @@ -253,14 +358,16 @@ def main(args): # String concatenation looks bad, but there's no SQL injection possible here # because BQL can't write back to the Beancount files. I hope! - query = f"SELECT filename, META('lineno') AS posting_line, META('bank-statement') AS posting_statement, date, number(cost(position)), payee, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}" + query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}" result_types, result_rows = run_query(entries, options, query) books_trans = sort_records([standardize_beancount_record(row) for row in result_rows]) - matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = match_statement_and_books( + matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = format_matches( statement_trans, books_trans, args.show_reconciled_matches, args.csv_statement) + # assert books_balance == books_balance_reconciled + total_matched + total_missing_from_statement + out = io.StringIO() print('-' * 155) print(f'{"Statement transaction":<38} {"Books transaction":<44} Notes') @@ -268,17 +375,14 @@ def main(args): for _, output in sorted(matches): print(output) print('-' * 155) - print(f'Period: {begin_date} to {end_date}') - print(f'Statement/cleared balance: {args.statement_balance:12,.2f} (as provided by you)') - print(f'Books balance (all): {books_balance:12,.2f} (all transactions, includes unreconciled)') - print(f'Books balance (reconciled): {books_balance_reconciled:12,.2f} (transactions with "bank-statement" tag only)') - print(f'Matched above: {total_matched:12,.2f} ("bank-statement" tag yet to be applied)') - print(f'On statement only: {total_missing_from_books:12,.2f} (no match in books)') - print(f'On books only: {total_missing_from_statement:12,.2f} (no match on statement)') + print(f'Statement period: {begin_date} to {end_date}') + print(f'Statement/cleared balance: {args.statement_balance:12,.2f} (as provided by you)') + print(f'Books balance (all): {books_balance:12,.2f} (all transactions, includes unreconciled)') + print(f'Books balance (reconciled): {books_balance_reconciled:12,.2f} (transactions with "bank-statement" tag only)') + print(f'Matched above: {total_matched:12,.2f} ("bank-statement" tag yet to be applied)') + print(f'Unmatched on books: {total_missing_from_statement:12,.2f}') + print(f'Unmatched statement: {total_missing_from_books:12,.2f}') print('-' * 155) - # print(f'Remaning to reconcile: {books_balance - books_balance_reconciled - total_matched:12,.2f}') - # print(f'Total reconciled inc. above: {books_balance_reconciled + total_matched:12,.2f}') - # print('-' * 155) # Write statement metadata back to books if metadata_to_apply and not args.non_interactive: diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py new file mode 100644 index 0000000..5f75d3a --- /dev/null +++ b/tests/test_reconcile.py @@ -0,0 +1,175 @@ +import datetime +import decimal + +from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity + +S1 = { + 'date': datetime.date(2022, 1, 1), + 'amount': decimal.Decimal('10.00'), + 'payee': 'Patreon / Patreon / 123456/ ST-A1B2C3D4G5H6 /', + 'line': 222, +} +S2 = { + 'date': datetime.date(2022, 1, 2), + 'amount': decimal.Decimal('20.00'), + 'payee': 'BT*LINODE PHILADELPHIA P', + 'line': 333, +} +S3 = { + 'date': datetime.date(2022, 1, 3), + 'amount': decimal.Decimal('30.00'), + 'payee': 'USPS PO 4067540039 0PORTLAND OR', + 'line': 444, +} + +B1 = { + 'date': datetime.date(2022, 1, 1), + 'amount': decimal.Decimal('10.00'), + 'payee': 'Patreon', + 'entity': '', + 'check_id': None, + 'filename': '2022/imports.beancount', + 'line': 777, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} +B2 = { + 'date': datetime.date(2022, 1, 2), + 'amount': decimal.Decimal('20.00'), + 'payee': 'Linode', + 'entity': '', + 'check_id': None, + 'filename': '2022/main.beancount', + 'line': 888, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} +B3_next_day = { + 'date': datetime.date(2022, 1, 4), + 'amount': decimal.Decimal('30.00'), + 'payee': 'USPS', + 'entity': '', + 'check_id': None, + 'filename': '2022/main.beancount', + 'line': 999, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} +B3_next_week = { + 'date': datetime.date(2022, 1, 10), + 'amount': decimal.Decimal('30.00'), + 'payee': 'USPS', + 'entity': '', + 'check_id': None, + 'filename': '2022/main.beancount', + 'line': 999, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} +B3_mismatch_amount = { + 'date': datetime.date(2022, 1, 3), + 'amount': decimal.Decimal('31.00'), + 'payee': 'USPS', + 'entity': '', + 'check_id': None, + 'filename': '2022/main.beancount', + 'line': 999, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} +B3_payee_mismatch_1 = { + 'date': datetime.date(2022, 1, 3), + 'amount': decimal.Decimal('30.00'), + 'payee': 'Credit X', + 'entity': '', + 'check_id': None, + 'filename': '2022/main.beancount', + 'line': 999, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} +B3_payee_mismatch_2 = { + 'date': datetime.date(2022, 1, 3), + 'amount': decimal.Decimal('30.00'), + 'payee': 'Credit Y', + 'entity': '', + 'check_id': None, + 'filename': '2022/main.beancount', + 'line': 999, + 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" +} + + +def test_one_exact_match(): + statement = [S1] + books = [B1] + assert match_statement_and_books(statement, books) == [ + ([S1], [B1], []), + ] + +def test_multiple_exact_matches(): + statement = [S1, S2] + books = [B1, B2] + assert match_statement_and_books(statement, books) == [ + ([S1], [B1], []), + ([S2], [B2], []), + ] + +def test_one_mismatch(): + statement = [S1] + books = [] + assert match_statement_and_books(statement, books) == [ + ([S1], [], ['no match']), + ] + +def test_multiple_mismatches(): + statement = [S1] + books = [B2] + assert match_statement_and_books(statement, books) == [ + ([S1], [], ['no match']), + ([], [B2], ['no match']), + ] + +def test_next_day_matches(): + statement = [S3] + books = [B3_next_day] + assert match_statement_and_books(statement, books) == [ + ([S3], [B3_next_day], ['+/- 1 days']), + ] + +def test_next_week_matches(): + statement = [S3] + books = [B3_next_week] + assert match_statement_and_books(statement, books) == [ + ([S3], [B3_next_week], ['+/- 7 days']), + ] + +def test_incorrect_amount_does_not_match(): + statement = [S3] + books = [B3_mismatch_amount] + assert match_statement_and_books(statement, books) == [ + ([S3], [], ['no match']), + ([], [B3_mismatch_amount], ['no match']), + ] + +def test_payee_mismatch_ok_when_only_one_that_amount_and_date(): + statement = [S3] + books = [B3_payee_mismatch_1] + assert match_statement_and_books(statement, books) == [ + ([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match']), + ] + +def test_payee_mismatch_not_ok_when_multiple_that_amount_and_date(): + statement = [S3] + books = [B3_payee_mismatch_1, B3_payee_mismatch_2] + match = match_statement_and_books(statement, books) + assert match == [ + ([S3], [], ['no match']), + ([], [B3_payee_mismatch_1], ['no match']), + ([], [B3_payee_mismatch_2], ['no match']), + ] + +# def test_subset_sum_with_same_date_and_payee(): + +def test_remove_payee_junk(): + assert remove_payee_junk('WIDGETSRUS INC PAYMENT 1') == 'WIDGETSRUS' + assert remove_payee_junk('0000010017') == '10017' + +def test_date_proximity(): + assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23)) == 1.0 + assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=30)) == 0.5 + assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0