reconcile: CLI entrypoint, improve docs.

2022-03-02 10:05:07 +11:00 · 2022-03-02 10:05:07 +11:00 · fb5d0a57f3
commit fb5d0a57f3
parent 59dfbb78d1
2 changed files with 102 additions and 42 deletions
--- a/conservancy_beancount/reconcile/helper.py
+++ b/conservancy_beancount/reconcile/helper.py
@ -228,6 +228,7 @@ for desc, query in QUERIES.items():
 uncleared = [(r[0], r[2], r[4] or r[3], r[1]) for r in uncleared_rows]
 report_path = os.path.join(os.getenv('CONSERVANCY_REPOSITORY', ''), reconciliation_report_path(account, lastDateInPeriod))
 # TODO: Make the directory if it doesn't exist.
 with open(report_path, 'w') as f:
    f.write(reconciliation_report(account, lastDateInPeriod, cleared_balance, uncleared, '1900-01-01', all_trans_balance, []))
 print(f'Wrote reconciliation report: {report_path}.')
--- a/conservancy_beancount/reconcile/statement_reconciler.py
+++ b/conservancy_beancount/reconcile/statement_reconciler.py
@ -1,44 +1,97 @@
-"""Reconcile an AMEX/FR CSV statement against the books and print differences.
+"""Compare a bank CSV statement with the books.
-Beancount users often write importers to create bookkeeping entries direct from
+This tool takes an AMEX or First Republic CSV statement file and
-a bank statement or similar. That approach automates data entry and
+compares it line-by-line with the Beancount books to make sure that
-reconciliation in one step. In some cases though, it's useful to manually enter
+everything matches. This is designed for situations where transactions
-transactions and reconcile them later on. This workflow helpful in cases like
+are entered into the books directly, rather than being imported from a
-writing a paper check when there's a time lag between committing to making a
+statement after the fact.
 payment and the funds being debited. That's the workflow we're using here.
-Run like this:
+The reconciler will attempt to match transactions based on date,
 amount, check number and payee, but is forgiving to differences in
 dates, the absensce of check number and inexact matches on
 payee. Matches are ranked, so where there is only one decent match for
 an amount/date this is accepted, but if there are multiple similar
 candidates it will refuse to guess.
-$ python3 -m pip install thefuzz
+The reconciler will also attempt to identify where a single statement
-$ python3 conservancy_beancount/reconcile/statement_reconciler.py \
+entry has been split out into multiple Beancount postings, such as a
-  --beancount-file=$HOME/conservancy/beancount/books/2021.beancount \
+single bank transfer representing health insurance for multiple
-  --csv-statement=$HOME/conservancy/confidential/2021-09-10_AMEX_activity.csv \
+employees.
  --account=Liabilities:CreditCard:AMEX
-Conservancy currently enter data by hand rather than using Beancount importers.
+Run it like this:
-This tool is still somewhat like an importer in that it needs to extract
+
-transaction details from a third-party statement. Instead of creating
+$ statement_reconciler \
-directives, it just checks to see that similar directives are already present.
+  --beancount-file=2021.beancount \
  --account=Liabilities:CreditCard:AMEX \
  --csv-statement=2021-09-10_AMEX_activity.csv \
  --bank-statement=2021-09-10_AMEX_activity.csv \
  --statement-balance=1000
 Background:
 Beancount users often write importers to create bookkeeping entries
 direct from a bank statement or similar. That approach automates data
 entry and reconciliation in one step. In some cases though, it's
 useful to manually enter transactions and reconcile them later
 on. This workflow helpful in cases like writing a paper check when
 there's a time lag between committing to making a payment and the
 funds being debited. That's the workflow we're using here.
 Conservancy currently enter data by hand rather than using Beancount
 importers.  This tool is still somewhat like an importer in that it
 needs to extract transaction details from a third-party
 statement. Instead of creating directives, it just checks to see that
 similar directives are already present. This is a bit like diff-ing a
 statement with the books (though we're only interested in the presence
 of lines, not so much their order).
 Problems in scope:
- - errors in the books take hours to find during reconciliation ("you're entering a world of pain")
+
- - adding statement/reconciliation metadata to books is manual and prone to mistakes
+ - errors in the books take hours to find during reconciliation,
- - Beancount doesn't provide any infrastructure for programmatically updating the books, only appending
+   requiring manually comparing statemnts and the books and are
- - after updates to the books files, beancount must be restarted to reflect updates
+   succeptible to mistakes, such as not noticing when there are two
- - updates also invalidate the cache meaning restart takes several minutes
+   payments for the same amount on the statement, but not in the books
- - paper checks are entered in the books when written, but may not be cashed until months later (reconcile errors)
+   ("you're entering a world of pain")
- - balance checks are manually updated in svn/Financial/Ledger/sanity-check-balances.yaml
+
- - jumping to an individual transaction in a large ledger isn't trivial - Emacs grep mode is the current best option
+ - adding statement/reconciliation metadata to books is/was manual and
   prone to mistakes
 - Beancount doesn't provide any infrastructure for programmatically
   updating the books, only appending in the case of importers
 - paper checks are entered in the books when written, but may not be
   cashed until months later (reconcile errors)
 - jumping to an individual transaction in a large ledger isn't
   trivial - Emacs grep mode is the current best option
 - Pam and other staff don't use Emacs
 - auditors would prefer Bradley didn't perform reconciliation, ideally not Rosanne either
 - transactions are entered manually and reconciled after the fact, but importing from statements may be useful in some cases
-Q. How are reconciliation reports created currently? How do you read them?
+ - auditors would prefer Bradley didn't perform reconciliation,
- - by hand from copying and pasting from the helper tool output
+   ideally not Rosanne either
-Problem is potentially similar to diff-ing, but in the books, transaction order isn't super significant.
+ - reconciliation reports are created by hand when there are mismatches
 Other related problems we're not dealing with here:
 - after updates to the books files, beancount must be restarted to
   reflect updates
 - updates also invalidate the cache meaning restart takes several
   minutes
 - balance checks are manually updated in
   svn/Financial/Ledger/sanity-check-balances.yaml
 - transactions are entered manually and reconciled after the fact,
   but importing from statements may be useful in some cases
 """
 # TODO:
 #  - extract the magic numbers
 #  - consider merging in helper.py
 import argparse
 import collections
 import copy
@ -65,7 +118,7 @@ if not sys.warnoptions:
 from thefuzz import fuzz  # type: ignore
 logger = logging.getLogger()
-logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)
 # Console logging.
 logger.addHandler(logging.StreamHandler())
@ -105,6 +158,14 @@ def remove_duplicate_words(text: str) -> str:
 def remove_payee_junk(payee: str) -> str:
    """Clean up payee field to improve quality of fuzzy matching.
    It turns out that bank statement "description" fields are
    difficult to fuzzy match on because they're long and
    noisey. Truncating them (see standardize_XXX_record fns) and
    removing the common junk helps significantly.
    """
    for r in JUNK_WORDS_RES:
        payee = r.sub('', payee)
    payee = ZERO_RE.sub('', payee)
@ -129,8 +190,11 @@ def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable
    return sort_records([standardize_statement_record(row, i) for i, row in enumerate(reader, 2)])
-# CSV reconciliation report.
+def validate_amex_csv(sample: str, account: str) -> None:
-# Merge helper script?
+    required_cols = {'Date', 'Amount', 'Description', 'Card Member'}
    reader = csv.DictReader(io.StringIO(sample))
    if reader.fieldnames and not required_cols.issubset(reader.fieldnames):
        sys.exit(f"This CSV doesn't seem to have the columns we're expecting, including: {', '.join(required_cols)}")
 def standardize_amex_record(row: Dict, line: int) -> Dict:
@ -147,13 +211,6 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
    }
 def validate_amex_csv(sample: str, account: str) -> None:
    required_cols = {'Date', 'Amount', 'Description', 'Card Member'}
    reader = csv.DictReader(io.StringIO(sample))
    if reader.fieldnames and not required_cols.issubset(reader.fieldnames):
        sys.exit(f"This CSV doesn't seem to have the columns we're expecting, including: {', '.join(required_cols)}")
 def validate_fr_csv(sample: str, account: str) -> None:
    required_cols = {'Date', 'Amount', 'Detail', 'Serial Num'}
    reader = csv.DictReader(io.StringIO(sample))
@ -346,8 +403,6 @@ def metadata_for_match(match: Tuple[List, List, List], statement_filename: str,
    return metadata
 # TODO: Is there a way to pull the side-effecting code out of this function?
 def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
    """Insert reconciliation metadata in the books files.
@ -555,3 +610,7 @@ def main(args: argparse.Namespace) -> None:
 if __name__ == '__main__':
    args = parse_args(sys.argv)
    main(args)
 def entry_point():
    args = parse_args(sys.argv)
    main(args)