From 32fc4517a0540e60103d56e6454029c1dbe9e517 Mon Sep 17 00:00:00 2001
From: Ben Sturmfels <ben@sturm.com.au>
Date: Sat, 19 Feb 2022 00:27:48 +1100
Subject: [PATCH] reconcile: show check number, remove duplicate words in
 payee.

---
 .../reconcile/prototype_amex_reconciler.py    | 211 ++++++++++--------
 tests/test_reconcile.py                       |  35 +--
 2 files changed, 134 insertions(+), 112 deletions(-)

diff --git a/conservancy_beancount/reconcile/prototype_amex_reconciler.py b/conservancy_beancount/reconcile/prototype_amex_reconciler.py
index 487d068..149c785 100644
--- a/conservancy_beancount/reconcile/prototype_amex_reconciler.py
+++ b/conservancy_beancount/reconcile/prototype_amex_reconciler.py
@@ -88,11 +88,22 @@ JUNK_WORDS = [
     'llc',
     'online',
     'donation',
+    'usd',
 ]
 JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
 ZERO_RE = re.compile('^0+')
 
-def remove_payee_junk(payee):
+def remove_duplicate_words(text):
+    """Return text with case-insensitive repeat words dropped, first spellings kept."""
+    first_spelling = {}
+    for word in text.split():
+        # setdefault ignores later repeats, and dicts preserve insertion
+        # order, so values() yields the first spellings in original order.
+        first_spelling.setdefault(word.lower(), word)
+    return ' '.join(first_spelling.values())
+
+
+def remove_payee_junk(payee: str) -> str:
     for r in JUNK_WORDS_RES:
         payee = r.sub('', payee)
     payee = ZERO_RE.sub('', payee)
@@ -103,8 +114,9 @@ def remove_payee_junk(payee):
     payee = re.sub(re.escape('.com'), ' ', payee, flags=re.IGNORECASE)
     payee = re.sub(re.escape('.net'), ' ', payee, flags=re.IGNORECASE)
     payee = payee.replace('*', ' ')
-    payee = ' '.join([i for i in payee.split(' ') if len(i) > 3])
+    payee = ' '.join([i for i in payee.split(' ') if len(i) > 2])
     payee = payee.replace('-', ' ')
+    payee = remove_duplicate_words(payee)
     payee.strip()
     return payee
 
@@ -122,7 +134,8 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
         'amount': -1 * decimal.Decimal(row['Amount']),
         # Descriptions have too much noise, so taking just the start
         # significantly assists the fuzzy matching.
-        'payee': remove_payee_junk(row['Description'] or '')[:25],
+        'payee': remove_payee_junk(row['Description'] or '')[:20],
+        'check_id': '',
         'line': line,
     }
 
@@ -131,21 +144,19 @@ def standardize_fr_record(row: Dict, line: int) -> Dict:
     return {
         'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
         'amount': decimal.Decimal(row['Amount']),
-        'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20],
+        'payee': remove_payee_junk(row['Detail'] or '')[:20],
+        'check_id': row['Serial Num'].lstrip('0'),
         'line': line,
     }
 
 
 def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
     """Turn a Beancount query result row into a standard dict representing a transaction."""
-    # if '0000000362' in row.narration:
-    #     import pdb; pdb.set_trace()
     return {
         'date': row.date,
         'amount': row.number_cost_position,
-        'payee': remove_payee_junk(row.payee or row.narration)[:30],
-        'entity': row.entity,
-        'check_id': row.check_id,
+        'payee': remove_payee_junk(f'{row.payee or ""} {row.entity or ""} {row.narration or ""}'),
+        'check_id': str(row.check_id or ''),
         'filename': row.filename,
         'line': row.line,
         'bank_statement': row.bank_statement,
@@ -155,7 +166,14 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 def format_record(records: list[dict]) -> str:
     if len(records) == 1:
         record = records[0]
-        return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}"
+        # Shared "date: amount" lead-in; the tail is payee and/or "#check_id".
+        lead = f"{record['date'].isoformat()}: {record['amount']:11,.2f}"
+        tail = f"#{record['check_id']}"
+        if record['payee'] and record['check_id']:
+            tail = f"{record['payee'][:25]} {tail}"
+        elif record['payee']:
+            tail = record['payee'][:35]
+        return f'{lead} {tail}'.ljust(59)
     else:
         raise NotImplementedError
 
@@ -164,79 +182,7 @@ def sort_records(records: List) -> List:
     return sorted(records, key=lambda x: (x['date'], x['amount']))
 
 
-def match_statement_and_books2(statement_trans: list, books_trans: list):
-    matches = []
-    # We need a realised list and should be a copy so we can safely delete
-    # items.
-    books_trans = list(books_trans)
-
-    # run through all the statement transactions to find a matching transaction in
-    # the books. If found, the books transaction is marked off so that it can only
-    # be matched once. Some transactions will be matched, some will be on the
-    # statement but not the books and some on the books but not the statement.
-    #
-    # Doesn't currently find exact matches when there are a bunch of transactions
-    # for the same amount on the same day. Probably ought to do a first pass through
-    # the books transactions to find an closely matching payee, then do another pass
-    # disregarding payee.
-
-    for r1 in statement_trans:
-        best_match_score = 0
-        best_match_index = None
-        best_match_note = ''
-        matches_found = 0
-        for i, r2 in enumerate(books_trans):
-            score, note = records_match2(r1, r2)
-            if score >= 0.5 and score >= best_match_score:
-                matches_found += 1
-                best_match_score = score
-                best_match_index = i
-                best_match_note = note
-                # if note == 'payee mismatch':
-                #     payee_only_mismatches += 1
-        if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8:
-            if best_match_score <= 0.8:
-                best_match_note.append('only one decent match')
-            matches.append(([r1], [books_trans[best_match_index]], best_match_note))
-            del books_trans[best_match_index]
-        else:
-            matches.append(([r1], [], ['no match']))
-    for r2 in books_trans:
-        matches.append(([], [r2], ['no match']))
-    return matches
-
-def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str):
-    match_output = []
-    metadata_to_apply = []
-    total_matched = decimal.Decimal(0)
-    total_missing_from_books = decimal.Decimal(0)
-    total_missing_from_statement = decimal.Decimal(0)
-
-    matches = match_statement_and_books2(statement_trans, books_trans)
-    for r1, r2, note in matches:
-        note = ', '.join(note)
-        note = ': ' + note if note else note
-        if r1 and r2:
-            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ Matched{note}'])
-        elif r1:
-            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^50}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
-        else:
-            match_output.append([r2[0]['date'], f'{" ":^50}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
-    return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
-
-
-# TODO: Could potentially return a score so that we can find the best match from
-# a pool of candidates. How would be then remove that candidate from the global
-# pool?
-
-def date_proximity(d1, d2):
-    diff = abs((d1 - d2).days)
-    if diff > 60:
-        return 0
-    else:
-        return 1.0 - (diff / 60.0)
-
-def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]:
+def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
     """Do these records represent the same transaction?"""
 
     date_score = date_proximity(r1['date'], r2['date'])
@@ -253,25 +199,96 @@ def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]:
     else:
         amount_score, amount_message = 0.0, 'amount mismatch'
 
-    if r2['check_id']:
-        payee_score = 1.0 if r1['payee'] == str(r2['check_id']) else 0
-        payee_message = 'check # matched'
+    if r1['check_id'] and r2['check_id'] and r1['check_id'] == r2['check_id']:
+        check_score = 1.0
     else:
-        payee_match_quality_payee = fuzz.token_set_ratio(r1['payee'], r2['payee'])
-        payee_match_quality_entity = fuzz.token_set_ratio(r1['payee'], r2['entity'])
-        payee_score = max(payee_match_quality_payee, payee_match_quality_entity) / 100.0
-        if payee_score > 0.8:
-            payee_message = ''
-        elif payee_score > 0.5:
-            payee_message = 'partial payee match'
-        else:
-            payee_message = 'payee mismatch'
+        check_score = 0.0
 
-    overall_score = (date_score + amount_score + payee_score) / 4
+    payee_score = fuzz.token_set_ratio(r1['payee'], r2['payee']) / 100.00
+    if check_score == 1.0 or payee_score > 0.8:
+        payee_message = ''
+    elif payee_score > 0.4:
+        payee_message = 'partial payee match'
+    else:
+        payee_message = 'payee mismatch'
+
+    overall_score = (date_score + amount_score + check_score + payee_score) / 4
     overall_message = [m for m in [date_message, amount_message, payee_message] if m]
     return overall_score, overall_message
 
 
+def match_statement_and_books(statement_trans: list, books_trans: list):
+    """Pair each statement transaction with its best-scoring books transaction.
+
+    Runs through all the statement transactions to find a matching transaction
+    in the books. If found, the books transaction is marked off so that it can
+    only be matched once. Some transactions will be matched, some will be on the
+    statement but not the books and some on the books but not the statement.
+
+    Returns a list of (statement records, books records, notes) tuples.
+    """
+    matches = []
+    # Realise a private copy of the list so matched items can be deleted safely.
+    books_trans = list(books_trans)
+
+    for r1 in statement_trans:
+        best_match_score = 0
+        best_match_index = None
+        best_match_note = ''
+        matches_found = 0
+        for i, r2 in enumerate(books_trans):
+            score, note = records_match(r1, r2)
+            if score >= 0.5 and score >= best_match_score:
+                matches_found += 1
+                best_match_score = score
+                best_match_index = i
+                best_match_note = note
+        # NOTE(review): the payee note is spelled 'payee mismatch' (with a space), so the 'payee_mismatch' test below appears never to exclude anything -- confirm intent.
+        if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8:
+            if best_match_score <= 0.8:
+                best_match_note.append('only one decent match')
+            matches.append(([r1], [books_trans[best_match_index]], best_match_note))
+            del books_trans[best_match_index]
+        else:
+            matches.append(([r1], [], ['no match']))
+    for r2 in books_trans:
+        matches.append(([], [r2], ['no match']))
+    return matches
+
+
+def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str):
+    """Build [date, text] report rows for every match; the metadata list and totals are returned untouched (empty/zero)."""
+    match_output = []
+    metadata_to_apply = []
+    total_matched = decimal.Decimal(0)
+    total_missing_from_books = decimal.Decimal(0)
+    total_missing_from_statement = decimal.Decimal(0)
+    matches = match_statement_and_books(statement_trans, books_trans)
+    for r1, r2, note in matches:
+        note = ', '.join(note)
+        note = ': ' + note if note else note
+        if r1 and r2:
+            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ Matched{note}'])
+        elif r1:
+            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^59}  ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
+        else:
+            match_output.append([r2[0]['date'], f'{" ":^59}  →  {format_record(r2)}  ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
+    return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
+
+
+# TODO: Could potentially return a score so that we can find the best match from
+# a pool of candidates. How would we then remove that candidate from the global
+# pool?
+
+def date_proximity(d1, d2):
+    """Score two dates: 1.0 for the same day, falling linearly to 0.0 at 60 days, 0 beyond."""
+    days_apart = abs((d1 - d2).days)
+    if days_apart > 60:
+        return 0
+    return 1.0 - (days_apart / 60.0)
+
+
+
 # TODO: Is there a way to pull the side-effecting code out of this function?
 
 def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
@@ -358,7 +375,7 @@ def main(args):
 
     # String concatenation looks bad, but there's no SQL injection possible here
     # because BQL can't write back to the Beancount files. I hope!
-    query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
+    query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
     result_types, result_rows = run_query(entries, options, query)
 
     books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py
index 5f75d3a..ea9fe8e 100644
--- a/tests/test_reconcile.py
+++ b/tests/test_reconcile.py
@@ -1,24 +1,32 @@
 import datetime
 import decimal
 
-from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity
+from conservancy_beancount.reconcile.prototype_amex_reconciler import (
+    match_statement_and_books,
+    remove_payee_junk,
+    date_proximity,
+    remove_duplicate_words,
+)
 
 S1 = {
     'date': datetime.date(2022, 1, 1),
     'amount': decimal.Decimal('10.00'),
     'payee': 'Patreon         / Patreon   / 123456/ ST-A1B2C3D4G5H6       /',
+    'check_id': '',
     'line': 222,
 }
 S2 = {
     'date': datetime.date(2022, 1, 2),
     'amount': decimal.Decimal('20.00'),
     'payee': 'BT*LINODE           PHILADELPHIA        P',
+    'check_id': '',
     'line': 333,
 }
 S3 = {
     'date': datetime.date(2022, 1, 3),
     'amount': decimal.Decimal('30.00'),
     'payee': 'USPS PO 4067540039 0PORTLAND            OR',
+    'check_id': '',
     'line': 444,
 }
 
@@ -26,8 +34,7 @@ B1 = {
     'date': datetime.date(2022, 1, 1),
     'amount': decimal.Decimal('10.00'),
     'payee': 'Patreon',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/imports.beancount',
     'line': 777,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -36,8 +43,7 @@ B2 = {
     'date': datetime.date(2022, 1, 2),
     'amount': decimal.Decimal('20.00'),
     'payee': 'Linode',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/main.beancount',
     'line': 888,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -46,8 +52,7 @@ B3_next_day = {
     'date': datetime.date(2022, 1, 4),
     'amount': decimal.Decimal('30.00'),
     'payee': 'USPS',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/main.beancount',
     'line': 999,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -56,8 +61,7 @@ B3_next_week = {
     'date': datetime.date(2022, 1, 10),
     'amount': decimal.Decimal('30.00'),
     'payee': 'USPS',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/main.beancount',
     'line': 999,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -66,8 +70,7 @@ B3_mismatch_amount = {
     'date': datetime.date(2022, 1, 3),
     'amount': decimal.Decimal('31.00'),
     'payee': 'USPS',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/main.beancount',
     'line': 999,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -76,8 +79,7 @@ B3_payee_mismatch_1 = {
     'date': datetime.date(2022, 1, 3),
     'amount': decimal.Decimal('30.00'),
     'payee': 'Credit X',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/main.beancount',
     'line': 999,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -86,8 +88,7 @@ B3_payee_mismatch_2 = {
     'date': datetime.date(2022, 1, 3),
     'amount': decimal.Decimal('30.00'),
     'payee': 'Credit Y',
-    'entity': '',
-    'check_id': None,
+    'check_id': '',
     'filename': '2022/main.beancount',
     'line': 999,
     'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
@@ -173,3 +174,7 @@ def test_date_proximity():
     assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23)) == 1.0
     assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=30)) == 0.5
     assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0
+
+
+def test_remove_duplicate_words():
+    assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow'  # 'FOO' repeats 'Foo' case-insensitively; first spelling wins