reconcile: CLI entrypoint, improve docs.

This commit is contained in:
Ben Sturmfels 2022-03-02 10:05:07 +11:00
parent 59dfbb78d1
commit fb5d0a57f3
Signed by: bsturmfels
GPG key ID: 023C05E2C9C068F0
2 changed files with 102 additions and 42 deletions

View file

@ -228,6 +228,7 @@ for desc, query in QUERIES.items():
uncleared = [(r[0], r[2], r[4] or r[3], r[1]) for r in uncleared_rows]
report_path = os.path.join(os.getenv('CONSERVANCY_REPOSITORY', ''), reconciliation_report_path(account, lastDateInPeriod))
# TODO: Make the directory if it doesn't exist.
with open(report_path, 'w') as f:
f.write(reconciliation_report(account, lastDateInPeriod, cleared_balance, uncleared, '1900-01-01', all_trans_balance, []))
print(f'Wrote reconciliation report: {report_path}.')

View file

@ -1,44 +1,97 @@
"""Reconcile an AMEX/FR CSV statement against the books and print differences.
"""Compare a bank CSV statement with the books.
Beancount users often write importers to create bookkeeping entries direct from
a bank statement or similar. That approach automates data entry and
reconciliation in one step. In some cases though, it's useful to manually enter
transactions and reconcile them later on. This workflow is helpful in cases like
writing a paper check when there's a time lag between committing to making a
payment and the funds being debited. That's the workflow we're using here.
This tool takes an AMEX or First Republic CSV statement file and
compares it line-by-line with the Beancount books to make sure that
everything matches. This is designed for situations where transactions
are entered into the books directly, rather than being imported from a
statement after the fact.
Run like this:
The reconciler will attempt to match transactions based on date,
amount, check number and payee, but is forgiving to differences in
dates, the absence of check number and inexact matches on
payee. Matches are ranked, so where there is only one decent match for
an amount/date this is accepted, but if there are multiple similar
candidates it will refuse to guess.
$ python3 -m pip install thefuzz
$ python3 conservancy_beancount/reconcile/statement_reconciler.py \
--beancount-file=$HOME/conservancy/beancount/books/2021.beancount \
--csv-statement=$HOME/conservancy/confidential/2021-09-10_AMEX_activity.csv \
--account=Liabilities:CreditCard:AMEX
The reconciler will also attempt to identify where a single statement
entry has been split out into multiple Beancount postings, such as a
single bank transfer representing health insurance for multiple
employees.
Conservancy currently enter data by hand rather than using Beancount importers.
This tool is still somewhat like an importer in that it needs to extract
transaction details from a third-party statement. Instead of creating
directives, it just checks to see that similar directives are already present.
Run it like this:
$ statement_reconciler \
--beancount-file=2021.beancount \
--account=Liabilities:CreditCard:AMEX \
--csv-statement=2021-09-10_AMEX_activity.csv \
--bank-statement=2021-09-10_AMEX_activity.csv \
--statement-balance=1000
Background:
Beancount users often write importers to create bookkeeping entries
direct from a bank statement or similar. That approach automates data
entry and reconciliation in one step. In some cases though, it's
useful to manually enter transactions and reconcile them later
on. This workflow is helpful in cases like writing a paper check when
there's a time lag between committing to making a payment and the
funds being debited. That's the workflow we're using here.
Conservancy currently enter data by hand rather than using Beancount
importers. This tool is still somewhat like an importer in that it
needs to extract transaction details from a third-party
statement. Instead of creating directives, it just checks to see that
similar directives are already present. This is a bit like diff-ing a
statement with the books (though we're only interested in the presence
of lines, not so much their order).
Problems in scope:
- errors in the books take hours to find during reconciliation ("you're entering a world of pain")
- adding statement/reconciliation metadata to books is manual and prone to mistakes
- Beancount doesn't provide any infrastructure for programmatically updating the books, only appending
- after updates to the books files, beancount must be restarted to reflect updates
- updates also invalidate the cache meaning restart takes several minutes
- paper checks are entered in the books when written, but may not be cashed until months later (reconcile errors)
- balance checks are manually updated in svn/Financial/Ledger/sanity-check-balances.yaml
- jumping to an individual transaction in a large ledger isn't trivial - Emacs grep mode is the current best option
- errors in the books take hours to find during reconciliation,
requiring manually comparing statements and the books and are
susceptible to mistakes, such as not noticing when there are two
payments for the same amount on the statement, but not in the books
("you're entering a world of pain")
- adding statement/reconciliation metadata to books is/was manual and
prone to mistakes
- Beancount doesn't provide any infrastructure for programmatically
updating the books, only appending in the case of importers
- paper checks are entered in the books when written, but may not be
cashed until months later (reconcile errors)
- jumping to an individual transaction in a large ledger isn't
trivial - Emacs grep mode is the current best option
- Pam and other staff don't use Emacs
- auditors would prefer Bradley didn't perform reconciliation, ideally not Rosanne either
- transactions are entered manually and reconciled after the fact, but importing from statements may be useful in some cases
Q. How are reconciliation reports created currently? How do you read them?
- by hand from copying and pasting from the helper tool output
- auditors would prefer Bradley didn't perform reconciliation,
ideally not Rosanne either
Problem is potentially similar to diff-ing, but in the books, transaction order isn't super significant.
- reconciliation reports are created by hand when there are mismatches
Other related problems we're not dealing with here:
- after updates to the books files, beancount must be restarted to
reflect updates
- updates also invalidate the cache meaning restart takes several
minutes
- balance checks are manually updated in
svn/Financial/Ledger/sanity-check-balances.yaml
- transactions are entered manually and reconciled after the fact,
but importing from statements may be useful in some cases
"""
# TODO:
# - extract the magic numbers
# - consider merging in helper.py
import argparse
import collections
import copy
@ -65,7 +118,7 @@ if not sys.warnoptions:
from thefuzz import fuzz # type: ignore
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)
# Console logging.
logger.addHandler(logging.StreamHandler())
@ -105,6 +158,14 @@ def remove_duplicate_words(text: str) -> str:
def remove_payee_junk(payee: str) -> str:
"""Clean up payee field to improve quality of fuzzy matching.
It turns out that bank statement "description" fields are
difficult to fuzzy match on because they're long and
noisey. Truncating them (see standardize_XXX_record fns) and
removing the common junk helps significantly.
"""
for r in JUNK_WORDS_RES:
payee = r.sub('', payee)
payee = ZERO_RE.sub('', payee)
@ -129,8 +190,11 @@ def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable
return sort_records([standardize_statement_record(row, i) for i, row in enumerate(reader, 2)])
# CSV reconciliation report.
# Merge helper script?
def validate_amex_csv(sample: str, account: str) -> None:
    """Sanity-check that *sample* looks like an AMEX activity CSV.

    Parses the header row of the sample text and exits the program with an
    explanatory message when any of the expected columns is missing.  The
    *account* argument is accepted for signature parity with the other
    validators but is not consulted here.
    """
    expected = {'Date', 'Amount', 'Description', 'Card Member'}
    header = csv.DictReader(io.StringIO(sample)).fieldnames
    # Only complain when a header row was actually parsed but lacks columns.
    if header and not expected <= set(header):
        sys.exit(f"This CSV doesn't seem to have the columns we're expecting, including: {', '.join(expected)}")
def standardize_amex_record(row: Dict, line: int) -> Dict:
@ -147,13 +211,6 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
}
# NOTE(review): this is the pre-move copy of validate_amex_csv shown as the
# removed side of the diff hunk; an identical definition appears earlier at
# the new location. It checks the CSV sample's header for the expected AMEX
# columns and exits with a message when any are missing.
def validate_amex_csv(sample: str, account: str) -> None:
required_cols = {'Date', 'Amount', 'Description', 'Card Member'}
reader = csv.DictReader(io.StringIO(sample))
# Validate only when a header row was parsed; a missing column aborts the run.
if reader.fieldnames and not required_cols.issubset(reader.fieldnames):
sys.exit(f"This CSV doesn't seem to have the columns we're expecting, including: {', '.join(required_cols)}")
def validate_fr_csv(sample: str, account: str) -> None:
required_cols = {'Date', 'Amount', 'Detail', 'Serial Num'}
reader = csv.DictReader(io.StringIO(sample))
@ -346,8 +403,6 @@ def metadata_for_match(match: Tuple[List, List, List], statement_filename: str,
return metadata
# TODO: Is there a way to pull the side-effecting code out of this function?
def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
"""Insert reconciliation metadata in the books files.
@ -555,3 +610,7 @@ def main(args: argparse.Namespace) -> None:
if __name__ == '__main__':
    # Direct script execution: build the argument namespace, then run.
    main(parse_args(sys.argv))
def entry_point():
    """Console-script entry point: parse CLI arguments and run main()."""
    main(parse_args(sys.argv))