reconcile: Update docs, move a few things around.

2022-03-03 10:55:09 +11:00 · 2022-03-03 10:55:09 +11:00 · 581ef0fa23
commit 581ef0fa23
parent 97a05003f3
1 changed files with 222 additions and 165 deletions
--- a/conservancy_beancount/reconcile/statement_reconciler.py
+++ b/conservancy_beancount/reconcile/statement_reconciler.py
@ -44,23 +44,25 @@ similar directives are already present. This is a bit like diff-ing a
 statement with the books (though we're only interested in the presence
 of lines, not so much their order).
 Paper checks are entered in the books when written (a.k.a. "posted"),
 but may not be cashed until months later sometimes causing
 reconciliation differences that live beyond a month. It's worth noting
 that there are really two dates here - the posting date and the
 cleared date. Beancount only allows us to model one, which is why
 carrying these reconciliation differences between months feels a bit
 awkward.
 Problems in scope:
 - errors in the books take hours to find during reconciliation,
-   requiring manually comparing statemnts and the books and are
+   requiring manually comparing statements and the books and are
   succeptible to mistakes, such as not noticing when there are two
   payments for the same amount on the statement, but not in the books
-   ("you're entering a world of pain")
+   (as Bradley likes to quote, "you're entering a world of pain")
 - adding statement/reconciliation metadata to books is/was manual and
   prone to mistakes
 - Beancount doesn't provide any infrastructure for programmatically
   updating the books, only appending in the case of importers
 - paper checks are entered in the books when written, but may not be
   cashed until months later (reconcile errors)
 - jumping to an individual transaction in a large ledger isn't
   trivial - Emacs grep mode is the current best option
@ -244,6 +246,7 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 def format_record(record: dict) -> str:
    """Generate output lines for a standard 1:1 match."""
    if record['payee'] and record['check_id']:
        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
    elif record['payee']:
@ -254,6 +257,7 @@ def format_record(record: dict) -> str:
 def format_multirecord(r1s: list[dict], r2s: list[dict], note: str) -> list[list]:
    """Generates output lines for one statement:multiple books transaction match."""
    assert len(r1s) == 1
    assert len(r2s) > 1
    match_output = []
@ -268,6 +272,13 @@ def sort_records(records: List) -> List:
 def first_word_exact_match(a: str, b: str) -> float:
    """Score a payee match based first word.
    We get a whole lot of good matches this way. Helps in the
    situation where the first word or two of a transaction description
    is useful and the rest is garbage.
    """
    if len(a) == 0 or len(b) == 0:
        return 0.0
    first_a = a.split()[0].strip()
@ -279,6 +290,7 @@ def first_word_exact_match(a: str, b: str) -> float:
 def payee_match(a: str, b: str) -> float:
    """Score a match between two payees."""
    fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
    first_word_match = first_word_exact_match(a, b)
    return max(fuzzy_match, first_word_match)
@ -286,7 +298,6 @@ def payee_match(a: str, b: str) -> float:
 def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
    """Do these records represent the same transaction?"""
    date_score = date_proximity(r1['date'], r2['date'])
    if r1['date'] == r2['date']:
        date_message = ''
@ -329,11 +340,20 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
 def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    """
+    """Match transactions between the statement and books.
-    Runs through all the statement transactions to find a matching transaction
+
-    in the books. If found, the books transaction is marked off so that it can
+    If matched, the books transaction is marked off so that it can
-    only be matched once. Some transactions will be matched, some will be on the
+    only be matched once. Some transactions will be matched, some will
-    statement but not the books and some on the books but not the statement.
+    be on the statement but not the books and some on the books but
    not the statement.
    Passes through any unmatched transactions.
    Currently we use the same matching logic for all types of
    statements. It's conceivable that you could have special cases to
    accurately match some types of statements, but that would be more
    work to maintain and test.
    """
    matches = []
    remaining_books_trans = []
@ -363,141 +383,14 @@ def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dic
    return matches, remaining_statement_trans, remaining_books_trans
-# TODO: Return list of tuples (instead of list of lists).
+def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
    """Match single statement transactions with multiple books transactions.
-def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:
+    Works similarly to match_statement_and_books in that it returns a
-    match_output = []
+    list of matches and lists of remaining statement and books
-    for r1s, r2s, note in matches:
+    transactions.
        note = ', '.join(note)
        note = ': ' + note if note else note
        if r1s and r2s:
            if show_reconciled_matches or not all(x['bank_statement'] for x in r2s):
                if len(r2s) == 1:
                    entry = [r1s[0]['date'], f'{format_record(r1s[0])}  →  {format_record(r2s[0])}  ✓ Matched{note}']
                    if 'payee mismatch' in note:
                        entry[1] = Fore.YELLOW + Style.BRIGHT + entry[1] + Style.RESET_ALL
                    match_output.append(entry)
                else:
                    match_output.extend(format_multirecord(r1s, r2s, note))
        elif r1s:
            match_output.append([r1s[0]['date'], Fore.RED + Style.BRIGHT + f'{format_record(r1s[0])}  →  {" ":^59}  ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1s[0]["line"]})' + Style.RESET_ALL])
        else:
            match_output.append([r2s[0]['date'], Fore.RED + Style.BRIGHT + f'{" ":^59}  →  {format_record(r2s[0])}  ✗ NOT ON STATEMENT ({os.path.basename(r2s[0]["filename"])}:{r2s[0]["line"]})' + Style.RESET_ALL])
    return match_output
 def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
    diff = abs(int((d1 - d2).days))
    if diff > 60:
        return 0.0
    else:
        return 1.0 - (diff / 60.0)
 def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
    # Can we really ever have multiple statement entries? Probably not.
    statement_filename = get_repo_relative_path(statement_filename)
    csv_filename = get_repo_relative_path(csv_filename)
    metadata = []
    statement_entries, books_entries, _ = match
    for books_entry in books_entries:
        for statement_entry in statement_entries:
            if not books_entry['bank_statement']:
                metadata.append((books_entry['filename'], books_entry['line'], f'    bank-statement: "{statement_filename}"'))
                metadata.append((books_entry['filename'], books_entry['line'], f'    bank-statement-csv: "{csv_filename}:{statement_entry["line"]}"'))
    return metadata
 def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
    """Insert reconciliation metadata in the books files.
    Takes a list of edits to make as tuples of form (filename, lineno, metadata):
    [
        ('2021/main.beancount', 4245, '    bank-statement: statement.pdf'),
        ('2021/main.beancount', 1057, '    bank-statement: statement.pdf'),
        ('2021/payroll.beancount', 257, '    bank-statement: statement.pdf'),
        ...,
    ]
    """
    file_contents: dict[str, list] = {}
    file_offsets: dict[str, int] = collections.defaultdict(int)
    # Load each books file into memory and insert the relevant metadata lines.
    # Line numbers change as we do this, so we keep track of the offset for each
    # file. Changes must be sorted by line number first or else the offsets will
    # break because we're jumping around making edits.
    for filename, line, metadata in sorted(metadata_to_apply):
        if filename not in file_contents:
            with open(filename, 'r') as f:
                file_contents[filename] = f.readlines()
        # Insert is inefficient, but fast enough for now in practise.
        file_contents[filename].insert(line + file_offsets[filename], metadata.rstrip() + '\n')
        file_offsets[filename] += 1
    # Writes each updated file back to disk.
    for filename, contents in file_contents.items():
        with open(filename, 'w') as f:
            f.writelines(contents)
            print(f'Wrote {filename}.')
 def get_repo_relative_path(path: str) -> str:
    return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))
 def parse_path(path: str) -> str:
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    return path
 def parse_repo_relative_path(path: str) -> str:
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    repo = os.getenv('CONSERVANCY_REPOSITORY')
    if not repo:
        raise argparse.ArgumentTypeError('$CONSERVANCY_REPOSITORY is not set.')
    if not path.startswith(repo):
        raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.')
    return path
 def parse_decimal_with_separator(number_text: str) -> decimal.Decimal:
    """decimal.Decimal can't parse numbers with thousands separator."""
    number_text = number_text.replace(',', '')
    return decimal.Decimal(number_text)
 def parse_arguments(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog=PROGNAME, description='Reconciliation helper')
    cliutil.add_version_argument(parser)
    cliutil.add_loglevel_argument(parser)
    parser.add_argument('--beancount-file', required=True, type=parse_path)
    parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--bank-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--account', required=True, help='eg. Liabilities:CreditCard:AMEX')
    # parser.add_argument('--report-group-regex')
    parser.add_argument('--show-reconciled-matches', action='store_true')
    parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")    # parser.add_argument('--statement-balance', type=parse_decimal_with_separator, required=True, help="A.K.A \"cleared balance\" taken from the end of the period on the PDF statement. Required because CSV statements don't include final or running totals")
    args = parser.parse_args(args=argv)
    return args
 def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
    total_matched = decimal.Decimal(0)
    total_missing_from_books = decimal.Decimal(0)
    total_missing_from_statement = decimal.Decimal(0)
    for statement_entries, books_entries, _ in matches:
        if statement_entries and books_entries:
            total_matched += sum(c['amount'] for c in statement_entries)
        elif statement_entries:
            total_missing_from_books += sum(c['amount'] for c in statement_entries)
        else:
            total_missing_from_statement += sum(c['amount'] for c in books_entries)
    return total_matched, total_missing_from_books, total_missing_from_statement
 def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
    matches = []
    remaining_books_trans = []
    remaining_statement_trans = []
@ -531,7 +424,171 @@ def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[
    return matches, remaining_statement_trans, remaining_books_trans
 # TODO: Return list of tuples (instead of list of lists).
 def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:
    """Produce a list of body output lines from the given matches.
    The first column is a date so we can re-sort the list to put the
    missing entries in the right place. The second column is the text
    output.
    """
    match_output = []
    for r1s, r2s, note in matches:
        note = ', '.join(note)
        note = ': ' + note if note else note
        if r1s and r2s:
            if show_reconciled_matches or not all(x['bank_statement'] for x in r2s):
                if len(r2s) == 1:
                    entry = [r1s[0]['date'], f'{format_record(r1s[0])}  →  {format_record(r2s[0])}  ✓ Matched{note}']
                    if 'payee mismatch' in note:
                        entry[1] = Fore.YELLOW + Style.BRIGHT + entry[1] + Style.RESET_ALL
                    match_output.append(entry)
                else:
                    match_output.extend(format_multirecord(r1s, r2s, note))
        elif r1s:
            match_output.append([r1s[0]['date'], Fore.RED + Style.BRIGHT + f'{format_record(r1s[0])}  →  {" ":^59}  ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1s[0]["line"]})' + Style.RESET_ALL])
        else:
            match_output.append([r2s[0]['date'], Fore.RED + Style.BRIGHT + f'{" ":^59}  →  {format_record(r2s[0])}  ✗ NOT ON STATEMENT ({os.path.basename(r2s[0]["filename"])}:{r2s[0]["line"]})' + Style.RESET_ALL])
    return match_output
 def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
    """Scores two days based on how close they are together."""
    ZERO_CUTOFF = 60  # Score will be zero for this many days apart.
    diff = abs(int((d1 - d2).days))
    if diff >= ZERO_CUTOFF:
        return 0.0
    else:
        return 1.0 - (diff / ZERO_CUTOFF)
 def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
    """Returns the bank-statement metadata that should be applied for a match."""
    # TODO: Our data structure would allow multiple statement entries
    # for a match, but would this ever make sense? Probably not.
    statement_filename = get_repo_relative_path(statement_filename)
    csv_filename = get_repo_relative_path(csv_filename)
    metadata = []
    statement_entries, books_entries, _ = match
    for books_entry in books_entries:
        for statement_entry in statement_entries:
            if not books_entry['bank_statement']:
                metadata.append((books_entry['filename'], books_entry['line'], f'    bank-statement: "{statement_filename}"'))
                metadata.append((books_entry['filename'], books_entry['line'], f'    bank-statement-csv: "{csv_filename}:{statement_entry["line"]}"'))
    return metadata
 def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
    """Insert reconciliation metadata in the books files.
    Takes a list of edits to make as tuples of form (filename, lineno, metadata):
    [
        ('2021/main.beancount', 4245, '    bank-statement: statement.pdf'),
        ('2021/main.beancount', 1057, '    bank-statement: statement.pdf'),
        ('2021/payroll.beancount', 257, '    bank-statement: statement.pdf'),
        ...,
    ]
    Beancount doesn't provide any infrastructure for programmatically
    updating the books, only appending in the case of importers. So
    we're on our own here.
    """
    file_contents: dict[str, list] = {}
    file_offsets: dict[str, int] = collections.defaultdict(int)
    # Load each books file into memory and insert the relevant metadata lines.
    # Line numbers change as we do this, so we keep track of the offset for each
    # file. Changes must be sorted by line number first or else the offsets will
    # break because we're jumping around making edits.
    for filename, line, metadata in sorted(metadata_to_apply):
        if filename not in file_contents:
            with open(filename, 'r') as f:
                file_contents[filename] = f.readlines()
        # Insert is inefficient, but fast enough for now in practise.
        file_contents[filename].insert(line + file_offsets[filename], metadata.rstrip() + '\n')
        file_offsets[filename] += 1
    # Writes each updated file back to disk.
    for filename, contents in file_contents.items():
        with open(filename, 'w') as f:
            f.writelines(contents)
            print(f'Wrote {filename}.')
 def get_repo_relative_path(path: str) -> str:
    """Chop off the unique per-person CONSERVANCY_REPOSITORY.
    CSV and PDF statement metadata should be relative to
    CONSERVANCY_REPOSITORY ie. without regards to exactly where on
    your computer all the files live.
    """
    return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))
 def parse_path(path: str) -> str:
    """Validate that a file exists for use in argparse."""
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    return path
 def parse_repo_relative_path(path: str) -> str:
    """Validate that a file exists and is within $CONSERVANCY_REPOSITORY.
    For use with argparse.
    """
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    repo = os.getenv('CONSERVANCY_REPOSITORY')
    if not repo:
        raise argparse.ArgumentTypeError('$CONSERVANCY_REPOSITORY is not set.')
    if not path.startswith(repo):
        raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.')
    return path
 def parse_decimal_with_separator(number_text: str) -> decimal.Decimal:
    """decimal.Decimal can't parse numbers with thousands separator."""
    number_text = number_text.replace(',', '')
    return decimal.Decimal(number_text)
 def parse_arguments(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog=PROGNAME, description='Reconciliation helper')
    cliutil.add_version_argument(parser)
    cliutil.add_loglevel_argument(parser)
    parser.add_argument('--beancount-file', required=True, type=parse_path)
    parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--bank-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--account', required=True, help='eg. Liabilities:CreditCard:AMEX')
    # parser.add_argument('--report-group-regex')
    parser.add_argument('--show-reconciled-matches', action='store_true')
    parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")    # parser.add_argument('--statement-balance', type=parse_decimal_with_separator, required=True, help="A.K.A \"cleared balance\" taken from the end of the period on the PDF statement. Required because CSV statements don't include final or running totals")
    args = parser.parse_args(args=argv)
    return args
 def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
    """Calculate the totals of transactions matched/not-matched."""
    total_matched = decimal.Decimal(0)
    total_missing_from_books = decimal.Decimal(0)
    total_missing_from_statement = decimal.Decimal(0)
    for statement_entries, books_entries, _ in matches:
        if statement_entries and books_entries:
            total_matched += sum(c['amount'] for c in statement_entries)
        elif statement_entries:
            total_missing_from_books += sum(c['amount'] for c in statement_entries)
        else:
            total_missing_from_statement += sum(c['amount'] for c in books_entries)
    return total_matched, total_missing_from_books, total_missing_from_statement
 def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
    """Format the remaining unmatched transactions to be added to one single list of matches."""
    matches: List[Tuple[List, List, List]] = []
    for r1 in statement_trans:
        matches.append(([r1], [], ['no match']))
@ -551,11 +608,8 @@ def main(arglist: Optional[Sequence[str]] = None,
        config = configmod.Config()
        config.load_file()
-    # TODO: Should put in a sanity check to make sure the statement you're feeding
+    # Validate and normalise the statement into our standard
-    # in matches the account you've provided.
+    # transaction data structure.
    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
    if 'AMEX' in args.account:
        validate_csv = validate_amex_csv
        standardize_statement_record = standardize_amex_record
@ -569,40 +623,43 @@ def main(arglist: Optional[Sequence[str]] = None,
        f.seek(0)
        statement_trans = read_transactions_from_csv(f, standardize_statement_record)
    # Dates are taken from the beginning/end of the statement.
    begin_date = statement_trans[0]['date']
    end_date = statement_trans[-1]['date']
-    # Do we traverse and filter the in-memory entries list and filter that, or do we
+    # Query for the Beancount books data for this above period.
-    # use Beancount Query Language (BQL) to get a list of transactions? Currently
+    #
-    # using BQL.
+    # There are pros and cons for using Beancount's in-memory entries
    # list directly and also for using Beancount Query Language (BQL)
    # to get a list of transactions? Using BQL because it's
    # convenient, but we don't have access to the full transaction
    # entry objects. Feels a bit strange that these approaches are so
    # disconnected.
    #
    # beancount.query.query_compile.compile() and
    # beancount.query.query_execute.filter_entries() look useful in this respect,
    # but I'm not clear on how to use compile(). An example would help.
    entries, _, options = loader.load_file(args.beancount_file)
    # books_balance_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
    #     AND date <= {end_date.isoformat()}"""
    # _, result_rows = run_query(entries, options, books_balance_query, numberify=True)
    # books_balance = result_rows[0][0] if result_rows else 0
    # String concatenation looks bad, but there's no SQL injection possible here
    # because BQL can't write back to the Beancount files. I hope!
    query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
    _, result_rows = run_query(entries, options, query)
    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
    # Apply two passes of matching, one for standard matches and one
    # for subset matches.
    matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
-    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(remaining_statement_trans, remaining_books_trans)
+    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
        remaining_statement_trans, remaining_books_trans)
    matches.extend(subset_matches)
    # Add the remaining unmatched to make one big list of matches, successful or not.
    unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
    matches.extend(unmatched)
    # Print out results of our matching.
    match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)
    _, total_missing_from_books, total_missing_from_statement = totals(matches)
    print('-' * 155)
    statement_heading = f'Statement transactions {begin_date} to {end_date}'
    print(f'{statement_heading:<52}            {"Books transactions":<58}   Notes')
@ -615,7 +672,7 @@ def main(arglist: Optional[Sequence[str]] = None,
    print(f'Total:                      {total_missing_from_statement + total_missing_from_books:12,.2f}')
    print('-' * 155)
-    # Write statement metadata back to books
+    # Write statement metadata back to the books.
    metadata_to_apply = []
    for match in matches:
        metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))