reconcile: Move code into functions, add totals.

parent 4bb6177e45
commit 6d7df795cb

1 changed file with 182 additions and 115 deletions
@@ -1,5 +1,12 @@
 """Reconcile an AMEX CSV statement against the books and print differences.
 
+Beancount users often write importers to create bookkeeping entries directly from
+a bank statement or similar. That approach automates data entry and
+reconciliation in one step. In some cases, though, it's useful to manually enter
+transactions and reconcile them later on. This workflow is helpful in cases like
+writing a paper check, where there's a time lag between committing to making a
+payment and the funds being debited. That's the workflow we're using here.
+
 Run like this:
 
 $ python3 -m pip install thefuzz
@@ -36,18 +43,31 @@ TODO/ISSUES:
 
 """
 import argparse
+import collections
 import csv
 import datetime
 import decimal
+import io
 import os
-from typing import Dict, List, Tuple
+import sys
+from typing import Callable, Dict, List, Tuple, TextIO
 
 from beancount import loader
 from beancount.query.query import run_query
+
+if not sys.warnoptions:
+    import warnings
+    # Disable annoying warning from thefuzz prompting for a C extension. The
+    # current pure-Python implementation isn't a bottleneck for us.
+    warnings.filterwarnings('ignore', category=UserWarning, module='thefuzz.fuzz')
 from thefuzz import fuzz  # type: ignore
 
 # NOTE: Statement doesn't seem to give us a running balance or a final total.
 
+def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable) -> list:
+    reader = csv.DictReader(f)
+    return sort_records([standardize_statement_record(row, reader.line_num) for row in reader])
+
 
 def standardize_amex_record(row: Dict, line: int) -> Dict:
     """Turn an AMEX CSV row into a standard dict format representing a transaction."""
@@ -59,6 +79,15 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
     }
 
 
+def standardize_fr_record(row: Dict, line: int) -> Dict:
+    return {
+        'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
+        'amount': decimal.Decimal(row['Amount']),
+        'payee': row['Detail'] or row['Description'],
+        'line': line,
+    }
+
+
 def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
     """Turn a Beancount query result row into a standard dict representing a transaction."""
     return {
@@ -70,14 +99,6 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
         'statement': row.posting_statement,
     }
 
-def standardize_fr_record(row: Dict, line: int) -> Dict:
-    return {
-        'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
-        'amount': decimal.Decimal(row['Amount']),
-        'payee': row['Detail'] or row['Description'],
-        'line': line,
-    }
-
 
 def format_record(record: Dict) -> str:
     return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:20]:<20}"
@@ -87,6 +108,49 @@ def sort_records(records: List) -> List:
     return sorted(records, key=lambda x: (x['date'], x['amount']))
 
 
+def match_statement_and_books(statement_trans: list, books_trans: list, show_reconciled_matches: bool, csv_statement: str) -> tuple[list, list, decimal.Decimal, decimal.Decimal, decimal.Decimal]:
+    matches = []
+    metadata_to_apply = []
+    total_matched = decimal.Decimal(0)
+    total_missing_from_books = decimal.Decimal(0)
+    total_missing_from_statement = decimal.Decimal(0)
+
+    # Run through all the statement transactions to find a matching transaction in
+    # the books. If found, the books transaction is marked off so that it can only
+    # be matched once. Some transactions will be matched, some will be on the
+    # statement but not the books and some on the books but not the statement.
+    #
+    # Doesn't currently find exact matches when there are a bunch of transactions
+    # for the same amount on the same day. Probably ought to do a first pass through
+    # the books transactions to find a closely matching payee, then do another pass
+    # disregarding payee.
+
+    # TODO: What if th
+    for r1 in statement_trans:
+        for r2 in books_trans:
+            match, note = records_match(r1, r2)
+            if match:
+                if not r2['statement'] or show_reconciled_matches:
+                    matches.append([r2['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ {note}'])
+                    total_matched += r2['amount']
+                if not r2['statement']:
+                    metadata_to_apply.append((r2['filename'], r2['line'], f'    bank-statement: "{os.path.basename(csv_statement)}:{r2["line"]}"\n'))
+                books_trans.remove(r2)
+                break
+        else:
+            matches.append([r1['date'], f'{format_record(r1)}  →  {" ":^45}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1["line"]})'])
+            total_missing_from_books += r1['amount']
+    for r2 in books_trans:
+        matches.append([r2['date'], f'{" ":^45}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2["filename"])}:{r2["line"]})'])
+        total_missing_from_statement += r2['amount']
+    return matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
+
+# TODO: Time for some test cases I think.
+
+# TODO: Could potentially return a score so that we can find the best match from
+# a pool of candidates. How would we then remove that candidate from the global
+# pool?
+
 def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
     """Do these records represent the same transaction?"""
     date_matches_exactly = r1['date'] == r2['date']
@@ -106,123 +170,126 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
         return False, ''
 
 
-parser = argparse.ArgumentParser(description='Reconciliation helper')
-parser.add_argument('--beancount-file', required=True)
-parser.add_argument('--csv-statement', required=True)
-parser.add_argument('--account', required=True, help='eg. Liabilities:CreditCard:AMEX')
-parser.add_argument('--grep-output-filename')
-# parser.add_argument('--report-group-regex')
-parser.add_argument('--show-reconciled-matches', action='store_true')
-parser.add_argument('--statement-total', required=True)
-args = parser.parse_args()
-
-statement_total = decimal.Decimal(args.statement_total)
-
-# TODO: Should put in a sanity check to make sure the statement you're feeding
-# in matches the account you've provided.
-if 'AMEX' in args.account:
-    standardize_statement_record = standardize_amex_record
-else:
-    standardize_statement_record = standardize_fr_record
-
-with open(args.csv_statement) as f:
-    reader = csv.DictReader(f)
-    statement_trans = sort_records([standardize_statement_record(row, reader.line_num) for row in reader])
-
-begin_date = statement_trans[0]['date']
-end_date = statement_trans[-1]['date']
-
-# Do we traverse and filter the in-memory entries list and filter that, or do we
-# use Beancount Query Language (BQL) to get a list of transactions? Currently
-# using BQL.
-#
-# beancount.query.query_compile.compile() and
-# beancount.query.query_execute.filter_entries() look useful in this respect,
-# but I'm not clear on how to use compile(). An example would help.
-entries, _, options = loader.load_file(args.beancount_file)
-
-cleared_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
-    AND date <= {end_date.isoformat()} AND META('bank-statement') != NULL"""
-result_types, result_rows = run_query(
-    entries,
-    options,
-    cleared_query,
-    numberify=True,
-)
-
-cleared_total = result_rows[0][0]
-
-# String concatenation looks bad, but there's no SQL injection possible here
-# because BQL can't write back to the Beancount files. I hope!
-query = f"SELECT filename, META('lineno') AS posting_line, META('bank-statement') AS posting_statement, date, number(cost(position)), payee, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
-result_types, result_rows = run_query(
-    entries,
-    options,
-    query,
-)
-
-books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
-
-num_statement_records = len(statement_trans)
-num_books_trans = len(books_trans)
-statement_index = 0
-books_index = 0
-matches = []
-metadata_to_apply = []
-
-# Run through all the statement transactions to find a matching transaction in
-# the books. If found, the books transaction is marked off so that it can only
-# be matched once. Some transactions will be matched, some will be on the
-# statement but not the books and some on the books but not the statement.
-#
-# Doesn't currently find exact matches when there are a bunch of transactions
-# for the same amount on the same day. Probably ought to do a first pass through
-# the books transactions to find an closely matching payee, then do another pass
-# disregarding payee.
-for r1 in statement_trans:
-    for r2 in books_trans:
-        match, note = records_match(r1, r2)
-        if match:
-            if not r2['statement'] or args.show_reconciled_matches:
-                    matches.append([r2['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ {note}'])
-            if not r2['statement']:
-                metadata_to_apply.append((r2['filename'], r2['line'], f'    bank-statement: "{os.path.basename(args.csv_statement)}:{r2["line"]}"\n'))
-            books_trans.remove(r2)
-            break
-    else:
-        matches.append([r1['date'], f'{format_record(r1)}  →  {" ":^45}  ✗ Not in books ({os.path.basename(args.csv_statement)}:{r1["line"]})'])
-for r2 in books_trans:
-    matches.append([r2['date'], f'{" ":^45}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2["filename"])}:{r2["line"]})'])
-
-print('-' * 155)
-print(f'{"Statement transaction":<38}            {"Books transaction":<44}   Notes')
-print('-' * 155)
-for _, output in sorted(matches):
-    print(output)
-print('-' * 155)
-print(f'STATEMENT TOTAL: {statement_total}')
-print(f'CLEARED TOTAL:   {cleared_total:12,.2f}')
-print('-' * 155)
-
-# Write statement metadata back to books
-if metadata_to_apply:
-    print('Mark matched transactions as reconciled in the books? (y/N) ', end='')
-    if input().lower() == 'y':
-        files = {}
-        # Query results aren't necessarily sequential in a file, so need to sort
-        # so that our line number offsets work.
-        for filename, line, metadata in sorted(metadata_to_apply):
-            if filename not in files:
-                with open(filename, 'r') as f:
-                    # print(f'Opening {filename}.')
-                    files[filename] = [0, f.readlines()]  # Offset and contents
-            files[filename][1].insert(line + files[filename][0], metadata)
-            files[filename][0] += 1
-            # print(f'File {filename} offset {files[filename][0]}')
-        for filename in files:
-            with open(filename, 'w') as f:
-                f.writelines(files[filename][1])
-                print(f'Wrote {filename}.')
+# TODO: Is there a way to pull the side-effecting code out of this function?
+
+def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
+    """Insert reconciliation metadata in the books files.
+
+    Takes a list of edits to make as tuples of the form (filename, lineno, metadata):
+
+    [
+        ('2021/main.beancount', 4245, '    bank-statement: statement.pdf'),
+        ('2021/main.beancount', 1057, '    bank-statement: statement.pdf'),
+        ('2021/payroll.beancount', 257, '    bank-statement: statement.pdf'),
+        ...,
+    ]
+
+    """
+    file_contents: dict[str, list] = {}
+    file_offsets: dict[str, int] = collections.defaultdict(int)
+    # Load each books file into memory and insert the relevant metadata lines.
+    # Line numbers change as we do this, so we keep track of the offset for each
+    # file. Changes must be sorted by line number first or else the offsets will
+    # break because we're jumping around making edits.
+    for filename, line, metadata in sorted(metadata_to_apply):
+        if filename not in file_contents:
+            with open(filename, 'r') as f:
+                file_contents[filename] = f.readlines()
+        # Insert is inefficient, but fast enough for now in practice.
+        file_contents[filename].insert(line + file_offsets[filename], metadata)
+        file_offsets[filename] += 1
+    # Write each updated file back to disk.
+    for filename, contents in file_contents.items():
+        with open(filename, 'w') as f:
+            f.writelines(contents)
+            print(f'Wrote {filename}.')
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='Reconciliation helper')
+    parser.add_argument('--beancount-file', required=True)
+    parser.add_argument('--csv-statement', required=True)
+    parser.add_argument('--account', required=True, help='eg. Liabilities:CreditCard:AMEX')
+    parser.add_argument('--grep-output-filename')
+    # parser.add_argument('--report-group-regex')
+    parser.add_argument('--show-reconciled-matches', action='store_true')
+    parser.add_argument('--statement-balance', type=decimal.Decimal, required=True, help="A.K.A \"cleared balance\" taken from the end of the period on the PDF statement. Required because CSV statements don't include final or running totals")
+    parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")
+    return parser.parse_args(args=argv[1:])
+
+
+def main(args):
+    # TODO: Should put in a sanity check to make sure the statement you're feeding
+    # in matches the account you've provided.
+    if 'AMEX' in args.account:
+        standardize_statement_record = standardize_amex_record
+    else:
+        standardize_statement_record = standardize_fr_record
+
+    with open(args.csv_statement) as f:
+        statement_trans = read_transactions_from_csv(f, standardize_statement_record)
+
+    begin_date = statement_trans[0]['date']
+    end_date = statement_trans[-1]['date']
+
+    # Do we traverse and filter the in-memory entries list and filter that, or do we
+    # use Beancount Query Language (BQL) to get a list of transactions? Currently
+    # using BQL.
+    #
+    # beancount.query.query_compile.compile() and
+    # beancount.query.query_execute.filter_entries() look useful in this respect,
+    # but I'm not clear on how to use compile(). An example would help.
+    entries, _, options = loader.load_file(args.beancount_file)
+
+    books_balance_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
+        AND date <= {end_date.isoformat()}"""
+    result_types, result_rows = run_query(entries, options, books_balance_query, numberify=True)
+    books_balance = result_rows[0][0] if result_rows else 0
+
+    books_balance_reconciled_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
+        AND date <= {end_date.isoformat()} AND META('bank-statement') != NULL"""
+    result_types, result_rows = run_query(entries, options, books_balance_reconciled_query, numberify=True)
+    books_balance_reconciled = result_rows[0][0] if result_rows else 0
+
+    # String concatenation looks bad, but there's no SQL injection possible here
+    # because BQL can't write back to the Beancount files. I hope!
+    query = f"SELECT filename, META('lineno') AS posting_line, META('bank-statement') AS posting_statement, date, number(cost(position)), payee, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
+    result_types, result_rows = run_query(entries, options, query)
+
+    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
+
+    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = match_statement_and_books(
+        statement_trans, books_trans, args.show_reconciled_matches, args.csv_statement)
+
+    out = io.StringIO()
+    print('-' * 155)
+    print(f'{"Statement transaction":<38}            {"Books transaction":<44}   Notes')
+    print('-' * 155)
+    for _, output in sorted(matches):
+        print(output)
+    print('-' * 155)
+    print(f'Period: {begin_date} to {end_date}')
+    print(f'Statement/cleared balance:  {args.statement_balance:12,.2f}    (as provided by you)')
+    print(f'Books balance (all):        {books_balance:12,.2f}    (all transactions, includes unreconciled)')
+    print(f'Books balance (reconciled): {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
+    print(f'Matched above:              {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
+    print(f'On statement only:          {total_missing_from_books:12,.2f}    (no match in books)')
+    print(f'On books only:              {total_missing_from_statement:12,.2f}    (no match on statement)')
+    print('-' * 155)
+    # print(f'Remaining to reconcile:          {books_balance - books_balance_reconciled - total_matched:12,.2f}')
+    # print(f'Total reconciled inc. above:    {books_balance_reconciled + total_matched:12,.2f}')
+    # print('-' * 155)
+
+    # Write statement metadata back to books
+    if metadata_to_apply and not args.non_interactive:
+        print('Mark matched transactions as reconciled in the books? (y/N) ', end='')
+        if input().lower() == 'y':
+            write_metadata_to_books(metadata_to_apply)
+
+
+if __name__ == '__main__':
+    args = parse_args(sys.argv)
+    main(args)
 
 # Local Variables:
 # python-shell-interpreter: "/home/ben/\.virtualenvs/conservancy-beancount-py39/bin/python"
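
Now that the write-back logic lives in write_metadata_to_books(), the "Time for some test cases" TODO becomes practical. As a rough sketch only, not part of this commit: the test below exercises the line-offset behaviour of write_metadata_to_books(). It assumes the script is importable as a module named reconcile (a guess at the file name), and the file contents and metadata strings are made up for illustration.

import os
import tempfile
import unittest

import reconcile  # hypothetical module name for the script above


class WriteMetadataToBooksTest(unittest.TestCase):
    def test_inserts_metadata_with_offsets(self):
        # Line numbers in metadata_to_apply refer to the original file, so the
        # second edit must land correctly after the first insert has shifted
        # everything below it down by one line.
        with tempfile.TemporaryDirectory() as tmp:
            path = os.path.join(tmp, 'main.beancount')
            with open(path, 'w') as f:
                f.write('line 1\nline 2\nline 3\n')
            reconcile.write_metadata_to_books([
                (path, 1, '    bank-statement: "statement.csv:10"\n'),
                (path, 3, '    bank-statement: "statement.csv:20"\n'),
            ])
            with open(path) as f:
                self.assertEqual(f.readlines(), [
                    'line 1\n',
                    '    bank-statement: "statement.csv:10"\n',
                    'line 2\n',
                    'line 3\n',
                    '    bank-statement: "statement.csv:20"\n',
                ])


if __name__ == '__main__':
    unittest.main()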