From d8f4eac53bb7dc66c16d780156cbd785fa9333e9 Mon Sep 17 00:00:00 2001
From: Ben Sturmfels <ben@sturm.com.au>
Date: Fri, 18 Feb 2022 20:36:11 +1100
Subject: [PATCH] reconcile: Improve reconciler matching and add test cases.

---
 .../reconcile/prototype_amex_reconciler.py    | 220 +++++++++++++-----
 tests/test_reconcile.py                       | 175 ++++++++++++++
 2 files changed, 337 insertions(+), 58 deletions(-)
 create mode 100644 tests/test_reconcile.py

diff --git a/conservancy_beancount/reconcile/prototype_amex_reconciler.py b/conservancy_beancount/reconcile/prototype_amex_reconciler.py
index 5a9b012..487d068 100644
--- a/conservancy_beancount/reconcile/prototype_amex_reconciler.py
+++ b/conservancy_beancount/reconcile/prototype_amex_reconciler.py
@@ -44,11 +44,14 @@ TODO/ISSUES:
 """
 import argparse
 import collections
+import copy
 import csv
 import datetime
 import decimal
 import io
+import logging
 import os
+import re
 import sys
 from typing import Callable, Dict, List, Tuple, TextIO
 
@@ -62,6 +65,49 @@ if not sys.warnoptions:
     warnings.filterwarnings('ignore', category=UserWarning, module='thefuzz.fuzz')
 from thefuzz import fuzz  # type: ignore
 
+logger = logging.getLogger()  # no name given, so this is the root logger
+logger.setLevel(logging.DEBUG)
+# NOTE(review): the lines here run at import time, so importing this module
+# sets the root level to DEBUG and attaches a console (stderr) handler to it.
+logger.addHandler(logging.StreamHandler())
+
+
+JUNK_WORDS = [  # fragments that remove_payee_junk() deletes from payee strings
+    'software',
+    'freedom',
+    'conservancy',
+    'conse',
+    'payment',
+    'echeck',
+    'bill',
+    'debit',
+    'wire',
+    'credit',
+    "int'l",
+    "in.l",  # compiled as a regex below, so '.' matches any single character
+    'llc',
+    'online',
+    'donation',
+]
+JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]  # unescaped and unanchored: also matches inside longer words
+ZERO_RE = re.compile('^0+')  # a run of zeros at the very start of the string
+
+def remove_payee_junk(payee):
+    """Return payee with junk words, leading zeros, punctuation and tokens of 3 or fewer characters removed."""
+    for r in JUNK_WORDS_RES:
+        payee = r.sub('', payee)
+    payee = ZERO_RE.sub('', payee)
+    payee = payee.replace(' - ', ' ')
+    payee = re.sub(r'\.0\.\d+', ' ', payee)
+    payee = payee.replace('.0', ' ')
+    payee = payee.replace('/', ' ')
+    payee = re.sub(r'\.(?:com|net)', ' ', payee, flags=re.IGNORECASE)
+    payee = payee.replace('*', ' ')
+    payee = ' '.join([i for i in payee.split(' ') if len(i) > 3])
+    payee = payee.replace('-', ' ')
+    # str.strip() returns a new string; its result must be used, not discarded.
+    return payee.strip()
+
 # NOTE: Statement doesn't seem to give us a running balance or a final total.
 
 def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable) -> list:
@@ -74,8 +120,10 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
     return {
         'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
         'amount': -1 * decimal.Decimal(row['Amount']),
+        # Descriptions have too much noise, so taking just the start
+        # significantly assists the fuzzy matching.
+        'payee': remove_payee_junk(row['Description'] or '')[:25],
         'line': line,
-        'payee': row['Description'] or '',
     }
 
 
@@ -83,39 +131,46 @@ def standardize_fr_record(row: Dict, line: int) -> Dict:
     return {
         'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
         'amount': decimal.Decimal(row['Amount']),
-        'payee': row['Detail'] or row['Description'],
+        'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20],
         'line': line,
     }
 
 
 def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
     """Turn a Beancount query result row into a standard dict representing a transaction."""
+    # The payee (falling back to the narration) is de-noised and cut to 30
+    # characters; 'entity' and 'check_id' are carried along for the matcher.
     return {
         'date': row.date,
         'amount': row.number_cost_position,
-        'payee': row.payee if row.payee else row.narration,
+        'payee': remove_payee_junk(row.payee or row.narration)[:30],
+        'entity': row.entity,
+        'check_id': row.check_id,
         'filename': row.filename,
-        'line': row.posting_line,
-        'statement': row.posting_statement,
+        'line': row.line,
+        'bank_statement': row.bank_statement,
     }
 
 
-def format_record(record: Dict) -> str:
-    return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:20]:<20}"
+def format_record(records: list[dict]) -> str:
+    """Render a one-element record list as fixed-width 'date: amount payee' text."""
+    if len(records) != 1:
+        raise NotImplementedError
+    record = records[0]
+    return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}"
 
 
 def sort_records(records: List) -> List:
     return sorted(records, key=lambda x: (x['date'], x['amount']))
 
 
-def match_statement_and_books(statement_trans: list, books_trans: list, show_reconciled_matches: bool, csv_statement: str) -> tuple[list, list, decimal.Decimal]:
+def match_statement_and_books2(statement_trans: list, books_trans: list):
     matches = []
-    metadata_to_apply = []
-    total_matched = decimal.Decimal(0)
-    total_missing_from_books = decimal.Decimal(0)
-    total_missing_from_statement = decimal.Decimal(0)
+    # Returns a list of (statement_records, books_records, notes) tuples. Work on
+    # a realised copy of books_trans so that matched entries can be deleted safely.
+    books_trans = list(books_trans)
 
-    # Run through all the statement transactions to find a matching transaction in
+    # Run through all the statement transactions to find a matching transaction in
     # the books. If found, the books transaction is marked off so that it can only
     # be matched once. Some transactions will be matched, some will be on the
     # statement but not the books and some on the books but not the statement.
@@ -125,49 +180,96 @@ def match_statement_and_books(statement_trans: list, books_trans: list, show_rec
     # the books transactions to find an closely matching payee, then do another pass
     # disregarding payee.
 
-    # TODO: What if th
     for r1 in statement_trans:
-        for r2 in books_trans:
-            match, note = records_match(r1, r2)
-            if match:
-                if not r2['statement'] or show_reconciled_matches:
-                        matches.append([r2['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ {note}'])
-                        total_matched += r2['amount']
-                if not r2['statement']:
-                    metadata_to_apply.append((r2['filename'], r2['line'], f'    bank-statement: "{os.path.basename(csv_statement)}:{r2["line"]}"\n'))
-                books_trans.remove(r2)
-                break
+        best_match_score, best_match_index, best_match_note = 0, None, []
+        matches_found = 0
+        for i, r2 in enumerate(books_trans):
+            score, note = records_match2(r1, r2)
+            if score < 0.5:
+                continue
+            # Count every decent candidate, not just those that beat the best so
+            # far; otherwise "only one decent match" depends on the books' order.
+            matches_found += 1
+            if score >= best_match_score:
+                best_match_score, best_match_index, best_match_note = score, i, note
+        # Accept a strong match outright, or a weaker one when it is the sole candidate.
+        # (A former "'payee_mismatch' not in note" test could never be false, so it is gone.)
+        if best_match_score > 0.8 or (best_match_score > 0.5 and matches_found == 1):
+            if best_match_score <= 0.8:
+                best_match_note.append('only one decent match')
+            matches.append(([r1], [books_trans[best_match_index]], best_match_note))
+            del books_trans[best_match_index]
         else:
-            matches.append([r1['date'], f'{format_record(r1)}  →  {" ":^45}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1["line"]})'])
-            total_missing_from_books += r1['amount']
+            matches.append(([r1], [], ['no match']))
     for r2 in books_trans:
-        matches.append([r2['date'], f'{" ":^45}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2["filename"])}:{r2["line"]})'])
-        total_missing_from_statement += r2['amount']
-    return matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
+        matches.append(([], [r2], ['no match']))
+    return matches
+
+def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str):
+    """Match statement to books; return (display rows, metadata_to_apply, matched, not-in-books, not-on-statement totals)."""
+    match_output = []
+    metadata_to_apply = []  # NOTE(review): never filled in here, so callers get no metadata to write back.
+    total_matched = total_missing_from_books = total_missing_from_statement = decimal.Decimal(0)
+    for r1, r2, note in match_statement_and_books2(statement_trans, books_trans):
+        note = ': ' + ', '.join(note) if note else ''
+        if r1 and r2:
+            if not r2[0]['bank_statement'] or show_reconciled_matches:  # hide already-reconciled matches by default
+                match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ Matched{note}'])
+                total_matched += r2[0]['amount']
+        elif r1:
+            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^50}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
+            total_missing_from_books += r1[0]['amount']
+        else:
+            match_output.append([r2[0]['date'], f'{" ":^50}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
+            total_missing_from_statement += r2[0]['amount']
+    return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
 
-# TODO: Time for some test cases I think.
 
 # TODO: Could potentially return a score so that we can find the best match from
 # a pool of candidates. How would be then remove that candidate from the global
 # pool?
 
-def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
-    """Do these records represent the same transaction?"""
-    date_matches_exactly = r1['date'] == r2['date']
-    date_matches_loosly = r1['date'] >= r2['date'] - datetime.timedelta(days=3) and r1['date'] <= r2['date'] + datetime.timedelta(days=3)
-    amount_matches = r1['amount'] == r2['amount']
-    payee_match_quality = fuzz.token_set_ratio(r1['payee'], r2['payee'])
-    payee_matches = payee_match_quality > 50
-    if date_matches_exactly and amount_matches and payee_matches:
-        return True, 'Matched'
-    elif date_matches_loosly and amount_matches and payee_matches:
-        return True, 'Matched +/- 3 days'
-    elif date_matches_exactly and amount_matches:
-        return True, f'Matched ignoring payee'
-    elif date_matches_loosly and amount_matches:
-        return True, f'Matched +/- 3 days, ignoring payee'
+def date_proximity(d1, d2):
+    """Score closeness of two dates: 1.0 for the same day, down to 0 at 60 or more days apart."""
+    if (gap := abs((d1 - d2).days)) <= 60:
+        return 1.0 - (gap / 60.0)
     else:
-        return False, ''
+        return 0
+
+def records_match2(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
     """Do these records represent the same transaction?"""
+    # Returns (score, notes). The score lies between 0.0 and 1.0 and weights the
+    # amount double: (date + amount + payee) / 4, where an equal amount scores 2.
+    date_score = date_proximity(r1['date'], r2['date'])
+    if r1['date'] == r2['date']:
+        date_message = ''
+    elif date_score > 0.0:
+        diff = abs((r1['date'] - r2['date']).days)
+        date_message = f'+/- {diff} days'
+    else:
+        date_message = 'date mismatch'
+
+    if r1['amount'] == r2['amount']:
+        amount_score, amount_message = 2.0, ''
+    else:
+        amount_score, amount_message = 0.0, 'amount mismatch'
+
+    if r2['check_id']:  # a check number, when present, replaces the payee comparison
+        if r1['payee'] == str(r2['check_id']):
+            payee_score, payee_message = 1.0, 'check # matched'
+        else:
+            payee_score, payee_message = 0.0, 'check # mismatch'
+    else:
+        payee_score = max(fuzz.token_set_ratio(r1['payee'], r2[key]) for key in ('payee', 'entity')) / 100.0
+        if payee_score > 0.8:
+            payee_message = ''
+        elif payee_score > 0.5:
+            payee_message = 'partial payee match'
+        else:
+            payee_message = 'payee mismatch'
+
+    overall_message = [m for m in [date_message, amount_message, payee_message] if m]
+    return (date_score + amount_score + payee_score) / 4, overall_message
 
 
 # TODO: Is there a way to pull the side-effecting code out of this function?
@@ -221,6 +323,9 @@ def parse_args(argv):
 def main(args):
     # TODO: Should put in a sanity check to make sure the statement you're feeding
     # in matches the account you've provided.
+
+    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
+
     if 'AMEX' in args.account:
         standardize_statement_record = standardize_amex_record
     else:
@@ -253,14 +358,16 @@ def main(args):
 
     # String concatenation looks bad, but there's no SQL injection possible here
     # because BQL can't write back to the Beancount files. I hope!
-    query = f"SELECT filename, META('lineno') AS posting_line, META('bank-statement') AS posting_statement, date, number(cost(position)), payee, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
+    query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
     result_types, result_rows = run_query(entries, options, query)
 
     books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
 
-    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = match_statement_and_books(
+    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = format_matches(
         statement_trans, books_trans, args.show_reconciled_matches, args.csv_statement)
 
+    # assert books_balance == books_balance_reconciled + total_matched + total_missing_from_statement
+
     out = io.StringIO()
     print('-' * 155)
     print(f'{"Statement transaction":<38}            {"Books transaction":<44}   Notes')
@@ -268,17 +375,14 @@ def main(args):
     for _, output in sorted(matches):
         print(output)
     print('-' * 155)
-    print(f'Period: {begin_date} to {end_date}')
-    print(f'Statement/cleared balance:  {args.statement_balance:12,.2f}    (as provided by you)')
-    print(f'Books balance (all):        {books_balance:12,.2f}    (all transactions, includes unreconciled)')
-    print(f'Books balance (reconciled): {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
-    print(f'Matched above:              {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
-    print(f'On statement only:          {total_missing_from_books:12,.2f}    (no match in books)')
-    print(f'On books only:              {total_missing_from_statement:12,.2f}    (no match on statement)')
+    print(f'Statement period: {begin_date} to {end_date}')
+    print(f'Statement/cleared balance:    {args.statement_balance:12,.2f}    (as provided by you)')
+    print(f'Books balance (all):          {books_balance:12,.2f}    (all transactions, includes unreconciled)')
+    print(f'Books balance (reconciled):   {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
+    print(f'Matched above:                {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
+    print(f'Unmatched on books:           {total_missing_from_statement:12,.2f}')
+    print(f'Unmatched statement:          {total_missing_from_books:12,.2f}')
     print('-' * 155)
-    # print(f'Remaning to reconcile:          {books_balance - books_balance_reconciled - total_matched:12,.2f}')
-    # print(f'Total reconciled inc. above:    {books_balance_reconciled + total_matched:12,.2f}')
-    # print('-' * 155)
 
     # Write statement metadata back to books
     if metadata_to_apply and not args.non_interactive:
diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py
new file mode 100644
index 0000000..5f75d3a
--- /dev/null
+++ b/tests/test_reconcile.py
@@ -0,0 +1,175 @@
+import datetime
+import decimal
+
+from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity
+
+S1 = {  # S*: statement-side records (date, amount, raw payee text, statement line)
+    'date': datetime.date(2022, 1, 1),
+    'amount': decimal.Decimal('10.00'),
+    'payee': 'Patreon         / Patreon   / 123456/ ST-A1B2C3D4G5H6       /',
+    'line': 222,
+}
+S2 = {
+    'date': datetime.date(2022, 1, 2),
+    'amount': decimal.Decimal('20.00'),
+    'payee': 'BT*LINODE           PHILADELPHIA        P',
+    'line': 333,
+}
+S3 = {
+    'date': datetime.date(2022, 1, 3),
+    'amount': decimal.Decimal('30.00'),
+    'payee': 'USPS PO 4067540039 0PORTLAND            OR',
+    'line': 444,
+}
+
+B1 = {  # B*: books-side records; B1 and B2 share date and amount with S1 and S2
+    'date': datetime.date(2022, 1, 1),
+    'amount': decimal.Decimal('10.00'),
+    'payee': 'Patreon',
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/imports.beancount',
+    'line': 777,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+B2 = {
+    'date': datetime.date(2022, 1, 2),
+    'amount': decimal.Decimal('20.00'),
+    'payee': 'Linode',
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/main.beancount',
+    'line': 888,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+B3_next_day = {
+    'date': datetime.date(2022, 1, 4),  # one day after S3
+    'amount': decimal.Decimal('30.00'),
+    'payee': 'USPS',
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/main.beancount',
+    'line': 999,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+B3_next_week = {
+    'date': datetime.date(2022, 1, 10),  # seven days after S3
+    'amount': decimal.Decimal('30.00'),
+    'payee': 'USPS',
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/main.beancount',
+    'line': 999,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+B3_mismatch_amount = {
+    'date': datetime.date(2022, 1, 3),
+    'amount': decimal.Decimal('31.00'),  # S3 is 30.00
+    'payee': 'USPS',
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/main.beancount',
+    'line': 999,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+B3_payee_mismatch_1 = {
+    'date': datetime.date(2022, 1, 3),
+    'amount': decimal.Decimal('30.00'),
+    'payee': 'Credit X',  # same date and amount as S3, different payee text
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/main.beancount',
+    'line': 999,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+B3_payee_mismatch_2 = {
+    'date': datetime.date(2022, 1, 3),
+    'amount': decimal.Decimal('30.00'),
+    'payee': 'Credit Y',  # same date and amount as S3, different payee text
+    'entity': '',
+    'check_id': None,
+    'filename': '2022/main.beancount',
+    'line': 999,
+    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
+}
+
+
+def test_one_exact_match():
+    """A lone statement row and its books counterpart pair up with an empty note list."""
+    expected = [
+        ([S1], [B1], []),
+    ]
+    assert match_statement_and_books([S1], [B1]) == expected
+
+def test_multiple_exact_matches():
+    """Two statement rows each pair with their own books entry, in statement order."""
+    expected = [
+        ([S1], [B1], []),
+        ([S2], [B2], []),
+    ]
+    assert match_statement_and_books([S1, S2], [B1, B2]) == expected
+
+def test_one_mismatch():
+    """With no books entries at all, the statement row is reported as unmatched."""
+    expected = [
+        ([S1], [], ['no match']),
+    ]
+    assert match_statement_and_books([S1], []) == expected
+
+def test_multiple_mismatches():
+    """Unrelated rows are each reported unmatched: the statement row first, then the books row."""
+    expected = [
+        ([S1], [], ['no match']),
+        ([], [B2], ['no match']),
+    ]
+    assert match_statement_and_books([S1], [B2]) == expected
+
+def test_next_day_matches():
+    """A books entry dated one day later still matches, with the offset noted."""
+    expected = [
+        ([S3], [B3_next_day], ['+/- 1 days']),
+    ]
+    assert match_statement_and_books([S3], [B3_next_day]) == expected
+
+def test_next_week_matches():
+    """A books entry dated seven days later still matches, with the offset noted."""
+    expected = [
+        ([S3], [B3_next_week], ['+/- 7 days']),
+    ]
+    assert match_statement_and_books([S3], [B3_next_week]) == expected
+
+def test_incorrect_amount_does_not_match():
+    """Same date and payee but a different amount leaves both rows unmatched."""
+    expected = [
+        ([S3], [], ['no match']),
+        ([], [B3_mismatch_amount], ['no match']),
+    ]
+    assert match_statement_and_books([S3], [B3_mismatch_amount]) == expected
+
+def test_payee_mismatch_ok_when_only_one_that_amount_and_date():
+    """A sole candidate with the right date and amount matches despite its payee."""
+    expected = [
+        ([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match']),
+    ]
+    assert match_statement_and_books([S3], [B3_payee_mismatch_1]) == expected
+
+def test_payee_mismatch_not_ok_when_multiple_that_amount_and_date():
+    """Two payee-mismatched candidates for one statement row leave everything unmatched."""
+    candidates = [B3_payee_mismatch_1, B3_payee_mismatch_2]
+    expected = [
+        ([S3], [], ['no match']),
+        ([], [B3_payee_mismatch_1], ['no match']),
+        ([], [B3_payee_mismatch_2], ['no match']),
+    ]
+    assert match_statement_and_books([S3], candidates) == expected
+
+# TODO: add test_subset_sum_with_same_date_and_payee().
+
+def test_remove_payee_junk():
+    cases = {'WIDGETSRUS INC PAYMENT 1': 'WIDGETSRUS', '0000010017': '10017'}
+    assert {raw: remove_payee_junk(raw) for raw in cases} == cases
+
+def test_date_proximity():
+    day = datetime.date(2021, 8, 23)
+    scores = [date_proximity(day, day - datetime.timedelta(days=n)) for n in (0, 30, 60)]
+    assert scores == [1.0, 0.5, 0.0]