reconcile: Update docs, move a few things around.

Ben Sturmfels 2022-03-03 10:55:09 +11:00
parent 97a05003f3
commit 581ef0fa23
Signed by: bsturmfels
GPG key ID: 023C05E2C9C068F0


@@ -44,23 +44,25 @@ similar directives are already present. This is a bit like diff-ing a
statement with the books (though we're only interested in the presence
of lines, not so much their order).
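That "presence, not order" comparison is essentially a multiset diff. A minimal sketch of the idea with hypothetical amounts (the real matcher below also scores dates and payees, and tolerates fuzziness):

import collections

statement_amounts = [100, 250, 250, 75]  # note the duplicated 250 payment
books_amounts = [250, 100, 75]

# Counter subtraction keeps only positive counts, so each side's
# leftovers fall out directly, regardless of line order.
missing_from_books = collections.Counter(statement_amounts) - collections.Counter(books_amounts)
missing_from_statement = collections.Counter(books_amounts) - collections.Counter(statement_amounts)
print(missing_from_books)      # Counter({250: 1}) - catches the duplicate-payment case
print(missing_from_statement)  # Counter()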
Paper checks are entered in the books when written (a.k.a. "posted"),
but may not be cashed until months later, sometimes causing
reconciliation differences that live beyond a month. It's worth noting
that there are really two dates here - the posting date and the
cleared date. Beancount only allows us to model one, which is why
carrying these reconciliation differences between months feels a bit
awkward.
Problems in scope:
- errors in the books take hours to find during reconciliation,
requiring manually comparing statements and the books, and are
susceptible to mistakes, such as not noticing when there are two
payments for the same amount on the statement, but not in the books
(as Bradley likes to quote, "you're entering a world of pain")
- adding statement/reconciliation metadata to books is/was manual and
prone to mistakes
- Beancount doesn't provide any infrastructure for programmatically
updating the books, only appending in the case of importers
- paper checks are entered in the books when written, but may not be
cashed until months later (reconcile errors)
- jumping to an individual transaction in a large ledger isn't
trivial - Emacs grep mode is the current best option
@@ -244,6 +246,7 @@ def standardize_beancount_record(row) -> Dict: # type: ignore[no-untyped-def]
def format_record(record: dict) -> str:
"""Generate output lines for a standard 1:1 match."""
if record['payee'] and record['check_id']:
output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
elif record['payee']:
@@ -254,6 +257,7 @@ def format_record(record: dict) -> str:
def format_multirecord(r1s: list[dict], r2s: list[dict], note: str) -> list[list]:
"""Generates output lines for one statement:multiple books transaction match."""
assert len(r1s) == 1
assert len(r2s) > 1
match_output = []
@@ -268,6 +272,13 @@ def sort_records(records: List) -> List:
def first_word_exact_match(a: str, b: str) -> float:
"""Score a payee match based first word.
We get a whole lot of good matches this way. Helps in the
situation where the first word or two of a transaction description
is useful and the rest is garbage.
"""
if len(a) == 0 or len(b) == 0:
return 0.0
first_a = a.split()[0].strip()
@@ -279,6 +290,7 @@ def first_word_exact_match(a: str, b: str) -> float:
def payee_match(a: str, b: str) -> float:
"""Score a match between two payees."""
fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
first_word_match = first_word_exact_match(a, b)
return max(fuzzy_match, first_word_match)
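For illustration, hypothetical payee strings where taking the maximum matters: the two descriptions share only their first token, so the fuzzy score alone is mediocre, but the exact first-word match lifts the final score to 1.0.

# Hypothetical payees: the tails differ entirely, the first words match.
score = payee_match('GITHUB X8823 SAN FRANCISCO', 'GITHUB PAYMENT 992')
assert score == 1.0  # first_word_exact_match beats the fuzzy score here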
@@ -286,7 +298,6 @@ def payee_match(a: str, b: str) -> float:
def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
"""Do these records represent the same transaction?"""
date_score = date_proximity(r1['date'], r2['date'])
if r1['date'] == r2['date']:
date_message = ''
@@ -329,11 +340,20 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
"""
Runs through all the statement transactions to find a matching transaction
in the books. If found, the books transaction is marked off so that it can
only be matched once. Some transactions will be matched, some will be on the
statement but not the books and some on the books but not the statement.
"""Match transactions between the statement and books.
If matched, the books transaction is marked off so that it can
only be matched once. Some transactions will be matched, some will
be on the statement but not the books and some on the books but
not the statement.
Passes through any unmatched transactions.
Currently we use the same matching logic for all types of
statements. It's conceivable that you could have special cases to
accurately match some types of statements, but that would be more
work to maintain and test.
"""
matches = []
remaining_books_trans = []
@@ -363,141 +383,14 @@ def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
return matches, remaining_statement_trans, remaining_books_trans
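The loop itself is elided by this diff. Purely as a hedged sketch (the threshold value here is hypothetical, and the real note handling lives in records_match and the elided lines), the greedy mark-off approach might look like:

# Sketch only, not the elided body: pair each statement transaction
# with its best-scoring books transaction that hasn't been used yet.
MATCH_THRESHOLD = 0.5  # hypothetical cutoff
matches, remaining_statement_trans, used = [], [], set()
for r1 in statement_trans:
    scored = [(records_match(r1, r2)[0], i)
              for i, r2 in enumerate(books_trans) if i not in used]
    best_score, best_i = max(scored, default=(0.0, -1))
    if best_score >= MATCH_THRESHOLD:
        used.add(best_i)  # mark off: a books transaction matches at most once
        matches.append(([r1], [books_trans[best_i]], []))
    else:
        remaining_statement_trans.append(r1)
remaining_books_trans = [r2 for i, r2 in enumerate(books_trans) if i not in used]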
# TODO: Return list of tuples (instead of list of lists).
def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
"""Match single statement transactions with multiple books transactions.
matches = []
remaining_books_trans = []
remaining_statement_trans = []
@@ -531,7 +424,171 @@ def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
return matches, remaining_statement_trans, remaining_books_trans
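The search itself is elided by the diff. One plausible shape for it, shown only as a hedged sketch with a hypothetical find_subset helper (the real code's candidate grouping and scoring will differ):

import itertools

def find_subset(r1: dict, candidates: list, max_size: int = 4):  # hypothetical helper
    """Return books transactions whose amounts sum to r1's amount, if any."""
    for size in range(2, max_size + 1):
        for combo in itertools.combinations(candidates, size):
            if sum(r2['amount'] for r2 in combo) == r1['amount']:
                return list(combo)
    return None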
# TODO: Return list of tuples (instead of list of lists).
def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:
"""Produce a list of body output lines from the given matches.
The first column is a date so we can re-sort the list to put the
missing entries in the right place. The second column is the text
output.
"""
match_output = []
for r1s, r2s, note in matches:
note = ', '.join(note)
note = ': ' + note if note else note
if r1s and r2s:
if show_reconciled_matches or not all(x['bank_statement'] for x in r2s):
if len(r2s) == 1:
entry = [r1s[0]['date'], f'{format_record(r1s[0])}{format_record(r2s[0])} ✓ Matched{note}']
if 'payee mismatch' in note:
entry[1] = Fore.YELLOW + Style.BRIGHT + entry[1] + Style.RESET_ALL
match_output.append(entry)
else:
match_output.extend(format_multirecord(r1s, r2s, note))
elif r1s:
match_output.append([r1s[0]['date'], Fore.RED + Style.BRIGHT + f'{format_record(r1s[0])}{" ":^59} ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1s[0]["line"]})' + Style.RESET_ALL])
else:
match_output.append([r2s[0]['date'], Fore.RED + Style.BRIGHT + f'{" ":^59}{format_record(r2s[0])} ✗ NOT ON STATEMENT ({os.path.basename(r2s[0]["filename"])}:{r2s[0]["line"]})' + Style.RESET_ALL])
return match_output
def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
"""Scores two days based on how close they are together."""
ZERO_CUTOFF = 60 # Score will be zero for this many days apart.
diff = abs(int((d1 - d2).days))
if diff >= ZERO_CUTOFF:
return 0.0
else:
return 1.0 - (diff / ZERO_CUTOFF)
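A few concrete values of the linear falloff:

import datetime
d = datetime.date(2022, 1, 1)
assert date_proximity(d, d) == 1.0                                # same day
assert date_proximity(d, d + datetime.timedelta(days=30)) == 0.5  # halfway to cutoff
assert date_proximity(d, d + datetime.timedelta(days=90)) == 0.0  # at/past 60 days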
def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
"""Returns the bank-statement metadata that should be applied for a match."""
# TODO: Our data structure would allow multiple statement entries
# for a match, but would this ever make sense? Probably not.
statement_filename = get_repo_relative_path(statement_filename)
csv_filename = get_repo_relative_path(csv_filename)
metadata = []
statement_entries, books_entries, _ = match
for books_entry in books_entries:
for statement_entry in statement_entries:
if not books_entry['bank_statement']:
metadata.append((books_entry['filename'], books_entry['line'], f' bank-statement: "{statement_filename}"'))
metadata.append((books_entry['filename'], books_entry['line'], f' bank-statement-csv: "{csv_filename}:{statement_entry["line"]}"'))
return metadata
def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
"""Insert reconciliation metadata in the books files.
Takes a list of edits to make as tuples of form (filename, lineno, metadata):
[
('2021/main.beancount', 4245, ' bank-statement: statement.pdf'),
('2021/main.beancount', 1057, ' bank-statement: statement.pdf'),
('2021/payroll.beancount', 257, ' bank-statement: statement.pdf'),
...,
]
Beancount doesn't provide any infrastructure for programmatically
updating the books, only appending in the case of importers. So
we're on our own here.
"""
file_contents: dict[str, list] = {}
file_offsets: dict[str, int] = collections.defaultdict(int)
# Load each books file into memory and insert the relevant metadata lines.
# Line numbers change as we do this, so we keep track of the offset for each
# file. Changes must be sorted by line number first or else the offsets will
# break because we're jumping around making edits.
for filename, line, metadata in sorted(metadata_to_apply):
if filename not in file_contents:
with open(filename, 'r') as f:
file_contents[filename] = f.readlines()
# Insert is inefficient, but fast enough for now in practice.
file_contents[filename].insert(line + file_offsets[filename], metadata.rstrip() + '\n')
file_offsets[filename] += 1
# Writes each updated file back to disk.
for filename, contents in file_contents.items():
with open(filename, 'w') as f:
f.writelines(contents)
print(f'Wrote {filename}.')
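To make the offset bookkeeping concrete, a worked example with hypothetical edits to a single file:

metadata_to_apply = [
    ('books.beancount', 20, '  bank-statement: "b.pdf"'),
    ('books.beancount', 10, '  bank-statement: "a.pdf"'),
]
# sorted() applies line 10 first; that insert shifts every later line
# down by one, so the edit aimed at line 20 correctly lands at 20 + 1.
# Processed unsorted, the stale offset would wrongly shift the line-10
# edit to line 11 even though nothing above it had moved.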
def get_repo_relative_path(path: str) -> str:
"""Chop off the unique per-person CONSERVANCY_REPOSITORY.
CSV and PDF statement metadata should be relative to
CONSERVANCY_REPOSITORY, i.e. without regard to exactly where on
your computer all the files live.
"""
return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))
def parse_path(path: str) -> str:
"""Validate that a file exists for use in argparse."""
if not os.path.exists(path):
raise argparse.ArgumentTypeError(f'File {path} does not exist.')
return path
def parse_repo_relative_path(path: str) -> str:
"""Validate that a file exists and is within $CONSERVANCY_REPOSITORY.
For use with argparse.
"""
if not os.path.exists(path):
raise argparse.ArgumentTypeError(f'File {path} does not exist.')
repo = os.getenv('CONSERVANCY_REPOSITORY')
if not repo:
raise argparse.ArgumentTypeError('$CONSERVANCY_REPOSITORY is not set.')
if not path.startswith(repo):
raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.')
return path
def parse_decimal_with_separator(number_text: str) -> decimal.Decimal:
"""decimal.Decimal can't parse numbers with thousands separator."""
number_text = number_text.replace(',', '')
return decimal.Decimal(number_text)
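The helper exists because the separator trips up the Decimal constructor directly:

import decimal
parse_decimal_with_separator('1,234.56')  # -> Decimal('1234.56')
# whereas decimal.Decimal('1,234.56') raises decimal.InvalidOperation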
def parse_arguments(argv: List[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(prog=PROGNAME, description='Reconciliation helper')
cliutil.add_version_argument(parser)
cliutil.add_loglevel_argument(parser)
parser.add_argument('--beancount-file', required=True, type=parse_path)
parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path)
parser.add_argument('--bank-statement', required=True, type=parse_repo_relative_path)
parser.add_argument('--account', required=True, help='e.g. Liabilities:CreditCard:AMEX')
# parser.add_argument('--report-group-regex')
parser.add_argument('--show-reconciled-matches', action='store_true')
parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")
# parser.add_argument('--statement-balance', type=parse_decimal_with_separator, required=True, help="A.K.A \"cleared balance\" taken from the end of the period on the PDF statement. Required because CSV statements don't include final or running totals")
args = parser.parse_args(args=argv)
return args
def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
"""Calculate the totals of transactions matched/not-matched."""
total_matched = decimal.Decimal(0)
total_missing_from_books = decimal.Decimal(0)
total_missing_from_statement = decimal.Decimal(0)
for statement_entries, books_entries, _ in matches:
if statement_entries and books_entries:
total_matched += sum(c['amount'] for c in statement_entries)
elif statement_entries:
total_missing_from_books += sum(c['amount'] for c in statement_entries)
else:
total_missing_from_statement += sum(c['amount'] for c in books_entries)
return total_matched, total_missing_from_books, total_missing_from_statement
def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
"""Format the remaining unmatched transactions to be added to one single list of matches."""
matches: List[Tuple[List, List, List]] = []
for r1 in statement_trans:
matches.append(([r1], [], ['no match']))
@@ -551,11 +608,8 @@ def main(arglist: Optional[Sequence[str]] = None,
config = configmod.Config()
config.load_file()
# TODO: Should put in a sanity check to make sure the statement you're feeding
# in matches the account you've provided.
# TODO: Can we open the files first, then pass the streams on to the rest of the program?
# Validate and normalise the statement into our standard
# transaction data structure.
if 'AMEX' in args.account:
validate_csv = validate_amex_csv
standardize_statement_record = standardize_amex_record
@@ -569,40 +623,43 @@ def main(arglist: Optional[Sequence[str]] = None,
f.seek(0)
statement_trans = read_transactions_from_csv(f, standardize_statement_record)
# Dates are taken from the beginning/end of the statement.
begin_date = statement_trans[0]['date']
end_date = statement_trans[-1]['date']
# Query for the Beancount books data for the above period.
#
# There are pros and cons to using Beancount's in-memory entries
# list directly versus using Beancount Query Language (BQL)
# to get a list of transactions. We use BQL because it's
# convenient, but it means we don't have access to the full
# transaction entry objects. It feels a bit strange that these
# approaches are so disconnected.
#
# beancount.query.query_compile.compile() and
# beancount.query.query_execute.filter_entries() look useful in this respect,
# but I'm not clear on how to use compile(). An example would help.
entries, _, options = loader.load_file(args.beancount_file)
# books_balance_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
# AND date <= {end_date.isoformat()}"""
# _, result_rows = run_query(entries, options, books_balance_query, numberify=True)
# books_balance = result_rows[0][0] if result_rows else 0
# String concatenation looks bad, but there's no SQL injection possible here
# because BQL can't write back to the Beancount files. I hope!
query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
_, result_rows = run_query(entries, options, query)
books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
# Apply two passes of matching, one for standard matches and one
# for subset matches.
matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
remaining_statement_trans, remaining_books_trans)
matches.extend(subset_matches)
# Add the remaining unmatched to make one big list of matches, successful or not.
unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
matches.extend(unmatched)
# Print out results of our matching.
match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)
_, total_missing_from_books, total_missing_from_statement = totals(matches)
print('-' * 155)
statement_heading = f'Statement transactions {begin_date} to {end_date}'
print(f'{statement_heading:<52} {"Books transactions":<58} Notes')
@@ -615,7 +672,7 @@ def main(arglist: Optional[Sequence[str]] = None,
print(f'Total: {total_missing_from_statement + total_missing_from_books:12,.2f}')
print('-' * 155)
# Write statement metadata back to the books.
metadata_to_apply = []
for match in matches:
metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))