From 32fc4517a0540e60103d56e6454029c1dbe9e517 Mon Sep 17 00:00:00 2001 From: Ben Sturmfels Date: Sat, 19 Feb 2022 00:27:48 +1100 Subject: [PATCH] reconcile: show check number, remove duplicate words in payee. --- .../reconcile/prototype_amex_reconciler.py | 211 ++++++++++-------- tests/test_reconcile.py | 35 +-- 2 files changed, 134 insertions(+), 112 deletions(-) diff --git a/conservancy_beancount/reconcile/prototype_amex_reconciler.py b/conservancy_beancount/reconcile/prototype_amex_reconciler.py index 487d068..149c785 100644 --- a/conservancy_beancount/reconcile/prototype_amex_reconciler.py +++ b/conservancy_beancount/reconcile/prototype_amex_reconciler.py @@ -88,11 +88,22 @@ JUNK_WORDS = [ 'llc', 'online', 'donation', + 'usd', ] JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS] ZERO_RE = re.compile('^0+') -def remove_payee_junk(payee): +def remove_duplicate_words(text): + unique_words = [] + known_words = set() + for word in text.split(): + if word.lower() not in known_words: + unique_words.append(word) + known_words.add(word.lower()) + return ' '.join(unique_words) + + +def remove_payee_junk(payee: str) -> str: for r in JUNK_WORDS_RES: payee = r.sub('', payee) payee = ZERO_RE.sub('', payee) @@ -103,8 +114,9 @@ def remove_payee_junk(payee): payee = re.sub(re.escape('.com'), ' ', payee, flags=re.IGNORECASE) payee = re.sub(re.escape('.net'), ' ', payee, flags=re.IGNORECASE) payee = payee.replace('*', ' ') - payee = ' '.join([i for i in payee.split(' ') if len(i) > 3]) + payee = ' '.join([i for i in payee.split(' ') if len(i) > 2]) payee = payee.replace('-', ' ') + payee = remove_duplicate_words(payee) payee.strip() return payee @@ -122,7 +134,8 @@ def standardize_amex_record(row: Dict, line: int) -> Dict: 'amount': -1 * decimal.Decimal(row['Amount']), # Descriptions have too much noise, so taking just the start # significantly assists the fuzzy matching. - 'payee': remove_payee_junk(row['Description'] or '')[:25], + 'payee': remove_payee_junk(row['Description'] or '')[:20], + 'check_id': '', 'line': line, } @@ -131,21 +144,19 @@ def standardize_fr_record(row: Dict, line: int) -> Dict: return { 'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(), 'amount': decimal.Decimal(row['Amount']), - 'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20], + 'payee': remove_payee_junk(row['Detail'] or '')[:20], + 'check_id': row['Serial Num'].lstrip('0'), 'line': line, } def standardize_beancount_record(row) -> Dict: # type: ignore[no-untyped-def] """Turn a Beancount query result row into a standard dict representing a transaction.""" - # if '0000000362' in row.narration: - # import pdb; pdb.set_trace() return { 'date': row.date, 'amount': row.number_cost_position, - 'payee': remove_payee_junk(row.payee or row.narration)[:30], - 'entity': row.entity, - 'check_id': row.check_id, + 'payee': remove_payee_junk(f'{row.payee or ""} {row.entity or ""} {row.narration or ""}'), + 'check_id': str(row.check_id or ''), 'filename': row.filename, 'line': row.line, 'bank_statement': row.bank_statement, @@ -155,7 +166,14 @@ def standardize_beancount_record(row) -> Dict: # type: ignore[no-untyped-def] def format_record(records: list[dict]) -> str: if len(records) == 1: record = records[0] - return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}" + + if record['payee'] and record['check_id']: + output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59) + elif record['payee']: + output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59) + else: + output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59) + return output else: raise NotImplementedError @@ -164,79 +182,7 @@ def sort_records(records: List) -> List: return sorted(records, key=lambda x: (x['date'], x['amount'])) -def match_statement_and_books2(statement_trans: list, books_trans: list): - matches = [] - # We need a realised list and should be a copy so we can safely delete - # items. - books_trans = list(books_trans) - - # run through all the statement transactions to find a matching transaction in - # the books. If found, the books transaction is marked off so that it can only - # be matched once. Some transactions will be matched, some will be on the - # statement but not the books and some on the books but not the statement. - # - # Doesn't currently find exact matches when there are a bunch of transactions - # for the same amount on the same day. Probably ought to do a first pass through - # the books transactions to find an closely matching payee, then do another pass - # disregarding payee. - - for r1 in statement_trans: - best_match_score = 0 - best_match_index = None - best_match_note = '' - matches_found = 0 - for i, r2 in enumerate(books_trans): - score, note = records_match2(r1, r2) - if score >= 0.5 and score >= best_match_score: - matches_found += 1 - best_match_score = score - best_match_index = i - best_match_note = note - # if note == 'payee mismatch': - # payee_only_mismatches += 1 - if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8: - if best_match_score <= 0.8: - best_match_note.append('only one decent match') - matches.append(([r1], [books_trans[best_match_index]], best_match_note)) - del books_trans[best_match_index] - else: - matches.append(([r1], [], ['no match'])) - for r2 in books_trans: - matches.append(([], [r2], ['no match'])) - return matches - -def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str): - match_output = [] - metadata_to_apply = [] - total_matched = decimal.Decimal(0) - total_missing_from_books = decimal.Decimal(0) - total_missing_from_statement = decimal.Decimal(0) - - matches = match_statement_and_books2(statement_trans, books_trans) - for r1, r2, note in matches: - note = ', '.join(note) - note = ': ' + note if note else note - if r1 and r2: - match_output.append([r1[0]['date'], f'{format_record(r1)} → {format_record(r2)} ✓ Matched{note}']) - elif r1: - match_output.append([r1[0]['date'], f'{format_record(r1)} → {" ":^50} ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})']) - else: - match_output.append([r2[0]['date'], f'{" ":^50} → {format_record(r2)} ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})']) - return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement - - -# TODO: Could potentially return a score so that we can find the best match from -# a pool of candidates. How would be then remove that candidate from the global -# pool? - -def date_proximity(d1, d2): - diff = abs((d1 - d2).days) - if diff > 60: - return 0 - else: - return 1.0 - (diff / 60.0) - -def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]: +def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]: """Do these records represent the same transaction?""" date_score = date_proximity(r1['date'], r2['date']) @@ -253,25 +199,96 @@ def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]: else: amount_score, amount_message = 0.0, 'amount mismatch' - if r2['check_id']: - payee_score = 1.0 if r1['payee'] == str(r2['check_id']) else 0 - payee_message = 'check # matched' + if r1['check_id'] and r2['check_id'] and r1['check_id'] == r2['check_id']: + check_score = 1.0 else: - payee_match_quality_payee = fuzz.token_set_ratio(r1['payee'], r2['payee']) - payee_match_quality_entity = fuzz.token_set_ratio(r1['payee'], r2['entity']) - payee_score = max(payee_match_quality_payee, payee_match_quality_entity) / 100.0 - if payee_score > 0.8: - payee_message = '' - elif payee_score > 0.5: - payee_message = 'partial payee match' - else: - payee_message = 'payee mismatch' + check_score = 0.0 - overall_score = (date_score + amount_score + payee_score) / 4 + payee_score = fuzz.token_set_ratio(r1['payee'], r2['payee']) / 100.00 + if check_score == 1.0 or payee_score > 0.8: + payee_message = '' + elif payee_score > 0.4: + payee_message = 'partial payee match' + else: + payee_message = 'payee mismatch' + + overall_score = (date_score + amount_score + check_score + payee_score) / 4 overall_message = [m for m in [date_message, amount_message, payee_message] if m] return overall_score, overall_message +def match_statement_and_books(statement_trans: list, books_trans: list): + """ + + + Runs through all the statement transactions to find a matching transaction + in the books. If found, the books transaction is marked off so that it can + only be matched once. Some transactions will be matched, some will be on the + statement but not the books and some on the books but not the statement. + + """ + matches = [] + # We need a realised list and should be a copy so we can safely delete + # items. + books_trans = list(books_trans) + + for r1 in statement_trans: + best_match_score = 0 + best_match_index = None + best_match_note = '' + matches_found = 0 + for i, r2 in enumerate(books_trans): + score, note = records_match(r1, r2) + if score >= 0.5 and score >= best_match_score: + matches_found += 1 + best_match_score = score + best_match_index = i + best_match_note = note + if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8: + if best_match_score <= 0.8: + best_match_note.append('only one decent match') + matches.append(([r1], [books_trans[best_match_index]], best_match_note)) + del books_trans[best_match_index] + else: + matches.append(([r1], [], ['no match'])) + for r2 in books_trans: + matches.append(([], [r2], ['no match'])) + return matches + + +def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str): + match_output = [] + metadata_to_apply = [] + total_matched = decimal.Decimal(0) + total_missing_from_books = decimal.Decimal(0) + total_missing_from_statement = decimal.Decimal(0) + + matches = match_statement_and_books(statement_trans, books_trans) + for r1, r2, note in matches: + note = ', '.join(note) + note = ': ' + note if note else note + if r1 and r2: + match_output.append([r1[0]['date'], f'{format_record(r1)} → {format_record(r2)} ✓ Matched{note}']) + elif r1: + match_output.append([r1[0]['date'], f'{format_record(r1)} → {" ":^59} ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})']) + else: + match_output.append([r2[0]['date'], f'{" ":^59} → {format_record(r2)} ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})']) + return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement + + +# TODO: Could potentially return a score so that we can find the best match from +# a pool of candidates. How would be then remove that candidate from the global +# pool? + +def date_proximity(d1, d2): + diff = abs((d1 - d2).days) + if diff > 60: + return 0 + else: + return 1.0 - (diff / 60.0) + + + # TODO: Is there a way to pull the side-effecting code out of this function? def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None: @@ -358,7 +375,7 @@ def main(args): # String concatenation looks bad, but there's no SQL injection possible here # because BQL can't write back to the Beancount files. I hope! - query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}" + query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}' result_types, result_rows = run_query(entries, options, query) books_trans = sort_records([standardize_beancount_record(row) for row in result_rows]) diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index 5f75d3a..ea9fe8e 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -1,24 +1,32 @@ import datetime import decimal -from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity +from conservancy_beancount.reconcile.prototype_amex_reconciler import ( + match_statement_and_books, + remove_payee_junk, + date_proximity, + remove_duplicate_words, +) S1 = { 'date': datetime.date(2022, 1, 1), 'amount': decimal.Decimal('10.00'), 'payee': 'Patreon / Patreon / 123456/ ST-A1B2C3D4G5H6 /', + 'check_id': '', 'line': 222, } S2 = { 'date': datetime.date(2022, 1, 2), 'amount': decimal.Decimal('20.00'), 'payee': 'BT*LINODE PHILADELPHIA P', + 'check_id': '', 'line': 333, } S3 = { 'date': datetime.date(2022, 1, 3), 'amount': decimal.Decimal('30.00'), 'payee': 'USPS PO 4067540039 0PORTLAND OR', + 'check_id': '', 'line': 444, } @@ -26,8 +34,7 @@ B1 = { 'date': datetime.date(2022, 1, 1), 'amount': decimal.Decimal('10.00'), 'payee': 'Patreon', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/imports.beancount', 'line': 777, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -36,8 +43,7 @@ B2 = { 'date': datetime.date(2022, 1, 2), 'amount': decimal.Decimal('20.00'), 'payee': 'Linode', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/main.beancount', 'line': 888, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -46,8 +52,7 @@ B3_next_day = { 'date': datetime.date(2022, 1, 4), 'amount': decimal.Decimal('30.00'), 'payee': 'USPS', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/main.beancount', 'line': 999, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -56,8 +61,7 @@ B3_next_week = { 'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('30.00'), 'payee': 'USPS', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/main.beancount', 'line': 999, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -66,8 +70,7 @@ B3_mismatch_amount = { 'date': datetime.date(2022, 1, 3), 'amount': decimal.Decimal('31.00'), 'payee': 'USPS', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/main.beancount', 'line': 999, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -76,8 +79,7 @@ B3_payee_mismatch_1 = { 'date': datetime.date(2022, 1, 3), 'amount': decimal.Decimal('30.00'), 'payee': 'Credit X', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/main.beancount', 'line': 999, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -86,8 +88,7 @@ B3_payee_mismatch_2 = { 'date': datetime.date(2022, 1, 3), 'amount': decimal.Decimal('30.00'), 'payee': 'Credit Y', - 'entity': '', - 'check_id': None, + 'check_id': '', 'filename': '2022/main.beancount', 'line': 999, 'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf" @@ -173,3 +174,7 @@ def test_date_proximity(): assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23)) == 1.0 assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=30)) == 0.5 assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0 + + +def test_remove_duplicate_words(): + assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow'