From 405dd553cb0006f3dd9f54f510df0bc0a98a7b00 Mon Sep 17 00:00:00 2001
From: Ben Sturmfels <ben@sturm.com.au>
Date: Wed, 23 Feb 2022 17:24:31 +1100
Subject: [PATCH] reconcile: Implement "subset sum" feature matching multiple
 books entries to a single statement entry.

---
 .../reconcile/statement_reconciler.py         | 111 ++++++++++++----
 tests/test_reconcile.py                       | 125 +++++++++++++-----
 2 files changed, 171 insertions(+), 65 deletions(-)

diff --git a/conservancy_beancount/reconcile/statement_reconciler.py b/conservancy_beancount/reconcile/statement_reconciler.py
index 0cca0d9..897c369 100644
--- a/conservancy_beancount/reconcile/statement_reconciler.py
+++ b/conservancy_beancount/reconcile/statement_reconciler.py
@@ -49,6 +49,7 @@ import csv
 import datetime
 import decimal
 import io
+import itertools
 import logging
 import os
 import re
@@ -57,6 +58,7 @@ from typing import Callable, Dict, List, Tuple, TextIO
 
 from beancount import loader
 from beancount.query.query import run_query
+from colorama import Fore, Style
 
 if not sys.warnoptions:
     import warnings
@@ -164,21 +166,26 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
     }
 
 
-def format_record(records: list[dict]) -> str:
-    if len(records) == 1:
-        record = records[0]
-
-        if record['payee'] and record['check_id']:
-            output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
-        elif record['payee']:
-            output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59)
-        else:
-            output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59)
-        return output
+def format_record(record: dict) -> str:
+    if record['payee'] and record['check_id']:
+        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
+    elif record['payee']:
+        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59)
     else:
-        raise NotImplementedError
+        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59)
+    return output
 
 
+def format_multirecord(r1s, r2s, note):
+    """Return [sort_key, text] rows for one statement record matched to several books records."""
+    assert len(r1s) == 1
+    assert len(r2s) > 1
+    r1, day = r1s[0], r1s[0]['date'].isoformat()  # Every row is keyed on the statement record.
+    match_output = [[day + ' ' + r1['payee'], f'{format_record(r1)}  →  {format_record(r2s[0])}  ✓ Matched{note}']]
+    for i, r2 in enumerate(r2s[1:]):
+        match_output.append([day + str(i) + r1['payee'], f'{day}:             ↳                                    →  {format_record(r2)}  ✓ Matched{note}'])
+    return match_output
+
 def sort_records(records: List) -> List:
     return sorted(records, key=lambda x: (x['date'], x['amount']))
 
@@ -253,13 +260,7 @@ def match_statement_and_books(statement_trans: list, books_trans: list):
 
     """
     matches = []
-    # We need a realised list and should be a copy so we can safely delete
-    # items.
-    books_trans = list(books_trans)
-
-    # We can delete the matched books trans, but seems not a good idea to delete
-    # while iterating through statement_trans. Instead pushing onto a separate
-    # list.
+    remaining_books_trans = []
     remaining_statement_trans = []
 
     for r1 in statement_trans:
@@ -278,12 +279,13 @@ def match_statement_and_books(statement_trans: list, books_trans: list):
             if best_match_score <= 0.8:
                 best_match_note.append('only one decent match')
             matches.append(([r1], [books_trans[best_match_index]], best_match_note))
+            # Don't try to make a second match against this books entry.
             del books_trans[best_match_index]
         else:
-            matches.append(([r1], [], ['no match']))
+            remaining_statement_trans.append(r1)
     for r2 in books_trans:
-        matches.append(([], [r2], ['no match']))
-    return matches
+        remaining_books_trans.append(r2)
+    return matches, remaining_statement_trans, remaining_books_trans
 
 
 def format_matches(matches, csv_statement: str, show_reconciled_matches):
@@ -292,12 +294,15 @@ def format_matches(matches, csv_statement: str, show_reconciled_matches):
         note = ', '.join(note)
         note = ': ' + note if note else note
         if r1 and r2:
-            if show_reconciled_matches:
-                match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ Matched{note}'])
+            if show_reconciled_matches and all(x['bank_statement'] for x in r2):
+                if len(r2) == 1:
+                    match_output.append([r1[0]['date'].isoformat() + r1[0]['payee'], f'{format_record(r1[0])}  →  {format_record(r2[0])}  ✓ Matched{note}'])
+                else:
+                    match_output.extend(format_multirecord(r1, r2, note))
         elif r1:
-            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^59}  ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
+            match_output.append([r1[0]['date'].isoformat() + r1[0]['payee'], Fore.RED + Style.BRIGHT + f'{format_record(r1[0])}  →  {" ":^59}  ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})' + Style.RESET_ALL])
         else:
-            match_output.append([r2[0]['date'], f'{" ":^59}  →  {format_record(r2)}  ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
+            match_output.append([r2[0]['date'].isoformat() + r2[0]['payee'], Fore.RED + Style.BRIGHT + f'{" ":^59}  →  {format_record(r2[0])}  ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})' + Style.RESET_ALL])
     return match_output
 
 
@@ -400,6 +405,51 @@ def totals(matches):
             total_missing_from_statement += sum(c['amount'] for c in books_entries)
     return total_matched, total_missing_from_books, total_missing_from_statement
 
+
+def subset_match(statement_trans, books_trans):
+    """Match one statement entry against the sum of several books entries.
+
+    Books entries sharing (date, payee), adjacent or not, are summed and scored
+    with records_match(). Matched entries are removed from both input lists.
+    Returns (matches, remaining_statement_trans, remaining_books_trans).
+    """
+    matches = []
+    remaining_books_trans = []
+    # Group eagerly: a lazy groupby() over a list we also remove from skips entries.
+    groups = {}
+    for entry in books_trans:
+        groups.setdefault((entry['date'], entry['payee']), []).append(entry)
+    for group_items in groups.values():
+        best_match_score = 0
+        best_match_index = None
+        best_match_note = []
+        matches_found = 0
+        total = sum(x['amount'] for x in group_items)
+        r2 = {**group_items[0], 'amount': total}  # Synthetic record carrying the group total.
+        for i, r1 in enumerate(statement_trans):
+            score, note = records_match(r1, r2)
+            if score >= 0.5 and score >= best_match_score:
+                matches_found += 1
+                best_match_score, best_match_index, best_match_note = score, i, note
+        if (best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note) or best_match_score > 0.8:
+            if best_match_score <= 0.8:
+                best_match_note.append('only one decent match')
+            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
+            del statement_trans[best_match_index]  # Don't match this statement entry twice.
+            for item in group_items:
+                books_trans.remove(item)
+        else:
+            remaining_books_trans.extend(group_items)  # Keep the real entries, not the summed stand-in.
+    return matches, list(statement_trans), remaining_books_trans
+
+def process_unmatched(statement_trans, books_trans):
+    """Wrap leftover records as one-sided 'no match' entries.
+
+    Statement records come first, followed by books records.
+    """
+    statement_only = [([r1], [], ['no match']) for r1 in statement_trans]
+    return statement_only + [([], [r2], ['no match']) for r2 in books_trans]
+
 def main(args):
     # TODO: Should put in a sanity check to make sure the statement you're feeding
     # in matches the account you've provided.
@@ -443,10 +493,13 @@ def main(args):
 
     books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
 
-    matches = match_statement_and_books(statement_trans, books_trans)
-    match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)
+    matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
+    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(remaining_statement_trans, remaining_books_trans)
+    matches.extend(subset_matches)
+    unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
+    matches.extend(unmatched)
 
-    # assert books_balance == books_balance_reconciled + total_matched + total_missing_from_statement
+    match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)
 
     total_matched, total_missing_from_books, total_missing_from_statement = totals(matches)
 
diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py
index 7703921..fc00ef5 100644
--- a/tests/test_reconcile.py
+++ b/tests/test_reconcile.py
@@ -13,6 +13,7 @@ from conservancy_beancount.reconcile.statement_reconciler import (
     metadata_for_match,
     write_metadata_to_books,
     totals,
+    subset_match,
 )
 
 # These data structures represent individual transactions as taken from the
@@ -40,6 +41,13 @@ S3 = {
     'check_id': '',
     'line': 444,
 }
+S4 = {
+    'date': datetime.date(2022, 8, 11),
+    'amount': decimal.Decimal('-2260.00'),
+    'payee': 'Trust 0000000362 210',
+    'check_id': '',
+    'line': 555,
+}
 
 # Books transaction examples.
 B1 = {
@@ -114,81 +122,116 @@ B3_unmatched_check_id = {
     'line': 999,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 }
+B4A = {
+    'date': datetime.date(2022, 8, 11),
+    'amount': decimal.Decimal('-250.00'),
+    'payee': 'TRUST 0000000362 ACH Retirement Plan',
+    'check_id': '',
+    'line': 1000,
+}
+B4B = {
+    'date': datetime.date(2022, 8, 11),
+    'amount': decimal.Decimal('-250.00'),
+    'payee': 'TRUST 0000000362 ACH Retirement Plan',
+    'check_id': '',
+    'line': 1000,
+}
+B4C = {
+    'date': datetime.date(2022, 8, 11),
+    'amount': decimal.Decimal('-1760.00'),
+    'payee': 'TRUST 0000000362 ACH Retirement Plan',
+    'check_id': '',
+    'line': 1000,
+}
+
 
 
 def test_one_exact_match():
     statement = [S1]
     books = [B1]
-    assert match_statement_and_books(statement, books) == [
+    assert match_statement_and_books(statement, books) == (
         # Match, match, notes.
         #
         # The matches are a list so we can implement subset-sum matching where
         # multiple books transactions may match to a single statement
         # transaction.
-        ([S1], [B1], []),
-    ]
+        [([S1], [B1], [])],
+        [],
+        [],
+    )
 
 def test_multiple_exact_matches():
     statement = [S1, S2]
     books = [B1, B2]
-    assert match_statement_and_books(statement, books) == [
-        ([S1], [B1], []),
-        ([S2], [B2], []),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [([S1], [B1], []), ([S2], [B2], [])],
+        [],
+        [],
+    )
 
 def test_one_mismatch():
     statement = [S1]
     books = []
-    assert match_statement_and_books(statement, books) == [
-        ([S1], [], ['no match']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [],
+        [S1],
+        [],
+    )
 
 def test_multiple_mismatches():
     statement = [S1]
     books = [B2]
-    assert match_statement_and_books(statement, books) == [
-        ([S1], [], ['no match']),
-        ([], [B2], ['no match']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [],
+        [S1],
+        [B2],
+    )
 
 def test_next_day_matches():
     statement = [S3]
     books = [B3_next_day]
-    assert match_statement_and_books(statement, books) == [
-        ([S3], [B3_next_day], ['+/- 1 days']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [([S3], [B3_next_day], ['+/- 1 days'])],
+        [],
+        [],
+    )
 
 def test_next_week_matches():
     statement = [S3]
     books = [B3_next_week]
-    assert match_statement_and_books(statement, books) == [
-        ([S3], [B3_next_week], ['+/- 7 days']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [([S3], [B3_next_week], ['+/- 7 days'])],
+        [],
+        [],
+    )
 
 def test_incorrect_amount_does_not_match():
     statement = [S3]
     books = [B3_mismatch_amount]
-    assert match_statement_and_books(statement, books) == [
-        ([S3], [], ['no match']),
-        ([], [B3_mismatch_amount], ['no match']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [],
+        [S3],
+        [B3_mismatch_amount],
+    )
 
 def test_payee_mismatch_ok_when_only_one_that_amount_and_date():
     statement = [S3]
     books = [B3_payee_mismatch_1]
-    assert match_statement_and_books(statement, books) == [
-        ([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match'])],
+        [],
+        [],
+    )
 
 def test_payee_mismatch_not_ok_when_multiple_that_amount_and_date():
     statement = [S3]
     books = [B3_payee_mismatch_1, B3_payee_mismatch_2]
     match = match_statement_and_books(statement, books)
-    assert match == [
-        ([S3], [], ['no match']),
-        ([], [B3_payee_mismatch_1], ['no match']),
-        ([], [B3_payee_mismatch_2], ['no match']),
-    ]
+    assert match == (
+        [],
+        [S3],
+        [B3_payee_mismatch_1, B3_payee_mismatch_2],
+    )
 
 def test_remove_payee_junk():
     assert remove_payee_junk('WIDGETSRUS INC PAYMENT 1') == 'WIDGETSRUS'
@@ -251,7 +294,17 @@ def test_payee_not_considered_if_check_id_present():
     # These records match aside from check-id.
     statement = [S3]
     books = [B3_unmatched_check_id]
-    assert match_statement_and_books(statement, books) == [
-        ([S3], [], ['no match']),
-        ([], [B3_unmatched_check_id], ['no match']),
-    ]
+    assert match_statement_and_books(statement, books) == (
+        [],
+        [S3],
+        [B3_unmatched_check_id],
+    )
+
+def test_subset_sum_match():
+    """Three books entries sharing a date and payee sum to a single statement entry."""
+    statement = [S4]
+    books = [B4A, B4B, B4C]
+    matches, remaining_statement, remaining_books = subset_match(statement, books)
+    assert matches == [([S4], [B4A, B4B, B4C], [])]
+    assert remaining_statement == []  # No remaining statement trans.
+    assert remaining_books == []  # No remaining books trans.