"""Compare a bank CSV statement with the books.
|
|
|
|
This tool takes an AMEX or First Republic CSV statement file and compares it
|
|
line-by-line with the Beancount books to make sure that everything matches. This
|
|
is designed for situations where transactions are entered into the books
|
|
directly, rather than being imported from a statement after the fact.
|
|
|
|
The reconciler will attempt to match transactions based on date, amount, check
|
|
number and payee, but is forgiving to differences in dates, the absensce of
|
|
check number and inexact matches on payee. Matches are ranked, so where there is
|
|
only one decent match for an amount/date this is accepted, but if there are
|
|
multiple similar candidates it will refuse to guess.
|
|
|
|
The reconciler will also attempt to identify where a single statement entry has
|
|
been split out into multiple Beancount postings, such as a single bank transfer
|
|
representing health insurance for multiple employees.
|
|
|
|
Run it like this:
|
|
|
|
$ statement_reconciler \
|
|
--beancount-file 2021.beancount \
|
|
--account Liabilities:CreditCard:AMEX \
|
|
--csv-statement ~/svn/2021-09-10_AMEX_activity.csv \
|
|
--bank-statement ~/svn/2021-09-10_AMEX_activity.pdf
|
|
|
|
Background:
|
|
|
|
Regular Beancount users often write automated importers to create bookkeeping
|
|
entries direct from a bank statement or similar. That combines data entry and
|
|
reconciliation in one step. Conservancy uses a different approach; they manually
|
|
entering transactions and reconciling them later on. This workflow is helpful in
|
|
cases like writing checks (see below). This is the workflow implented by this
|
|
tool.
|
|
|
|
That said, this tool *is* still somewhat like an importer in that it needs to
|
|
extract transaction details from a third-party statement. Instead of creating
|
|
directives, it just checks to see that similar directives are already
|
|
present. This is a bit like diff-ing a statement with the books (though we're
|
|
only interested in the presence of lines, not so much their order).
|
|
|
|
Paper checks are entered into the books when written (a.k.a. "posted"), but may
|
|
not be cashed until months later sometimes causing reconciliation differences
|
|
that live beyond a month. It's worth noting that there are really two dates here
|
|
- the posting date and the cleared date. Beancount only allows us to model one,
|
|
which is why carrying these reconciliation differences between months feels a
|
|
bit awkward.
|
|
|
|
Problems in scope:
|
|
|
|
- errors in the books take hours to find during reconciliation, requiring
|
|
manually comparing statements and the books and are succeptible to mistakes,
|
|
such as not noticing when there are two payments for the same amount on the
|
|
statement, but not in the books (as Bradley likes to quote, "you're entering
|
|
a world of pain")
|
|
|
|
- adding statement/reconciliation metadata to books is/was manual and prone to
|
|
mistakes
|
|
|
|
- jumping to an individual transaction in a large ledger isn't trivial - Emacs
|
|
grep mode is the current best option
|
|
|
|
- not all staff use Emacs
|
|
|
|
- auditors would prefer Bradley didn't perform reconciliation, ideally not
|
|
Rosanne either
|
|
|
|
- reconciliation reports are created by hand when there are mismatches
|
|
|
|
Other related problems we're not dealing with here:
|
|
|
|
- after updates to the books files, beancount must be restarted to reflect
|
|
updates
|
|
|
|
- updates also invalidate the cache meaning restart takes several minutes
|
|
|
|
- balance checks are manually updated in
|
|
svn/Financial/Ledger/sanity-check-balances.yaml
|
|
|
|
- transactions are entered manually and reconciled after the fact, but
|
|
importing from statements may be useful in some cases
|
|
|
|
Current issue:
|
|
|
|
- entry_point seems to swallow errors, meaning you get a fairly unhelpful
|
|
message if there's an unhandled error
|
|
|
|
Future possibilities:
|
|
|
|
- allow the reconciler to respect manually-applied metadata - not clear how
|
|
this would work exactly
|
|
|
|
- allow interactive matching where the user can specifiy a match
|
|
|
|
- consider combining this with helper.py into one more complete tool that both
|
|
reconciles and summarises the unreconciled transactions
|
|
"""

import argparse
import collections
import copy
import csv
import datetime
import decimal
import io
import itertools
import logging
import os
import re
import sys
from typing import Dict, List, Optional, Sequence, Tuple, TextIO

from beancount import loader
from beancount.query.query import run_query
from colorama import Fore, Style  # type: ignore

from .. import cliutil
from .. import config as configmod

if not sys.warnoptions:
    import warnings
    # Disable annoying warning from thefuzz prompting for a C extension. The
    # current pure-Python implementation isn't a bottleneck for us.
    warnings.filterwarnings('ignore', category=UserWarning, module='thefuzz.fuzz')
from thefuzz import fuzz  # type: ignore

PROGNAME = 'reconcile-statement'
logger = logging.getLogger(__name__)

# Get some interesting feedback on calls to RT with this:
# logger.setLevel(logging.DEBUG)
# logger.addHandler(logging.StreamHandler())

JUNK_WORDS = [
    'software',
    'freedom',
    'conservancy',
    'conse',
    'payment',
    'echeck',
    'bill',
    'debit',
    'wire',
    'credit',
    "int'l",
    "in.l",
    'llc',
    'online',
    'donation',
    'usd',
    'inc',
]
JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
ZERO_RE = re.compile('^0+')
FULL_MATCH_THRESHOLD = 0.8
PARTIAL_MATCH_THRESHOLD = 0.4

def remove_duplicate_words(text: str) -> str:
    unique_words = []
    known_words = set()
    for word in text.split():
        if word.lower() not in known_words:
            unique_words.append(word)
            known_words.add(word.lower())
    return ' '.join(unique_words)
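# A rough illustration of the deduplication above (hypothetical input):
# comparison is case-insensitive, and the first-seen casing of each word wins.
#
#     remove_duplicate_words('Amazon AMAZON Marketplace')  # -> 'Amazon Marketplace'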


def remove_payee_junk(payee: str) -> str:
    """Clean up payee field to improve quality of fuzzy matching.

    It turns out that bank statement "description" fields are
    difficult to fuzzy match on because they're long and
    noisy. Truncating them (see standardize_XXX_record fns) and
    removing the common junk helps significantly.

    """
    for r in JUNK_WORDS_RES:
        payee = r.sub('', payee)
    payee = ZERO_RE.sub('', payee)
    payee = payee.replace(' - ', ' ')
    payee = re.sub(r'\.0\.\d+', ' ', payee)
    payee = payee.replace('.0', ' ')
    payee = payee.replace('/', ' ')
    payee = re.sub(re.escape('.com'), ' ', payee, flags=re.IGNORECASE)
    payee = re.sub(re.escape('.net'), ' ', payee, flags=re.IGNORECASE)
    payee = payee.replace('*', ' ')
    payee = ' '.join([i for i in payee.split(' ') if len(i) > 2])
    payee = payee.replace('-', ' ')
    payee = remove_duplicate_words(payee)
    payee = payee.strip()
    return payee
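# A rough trace through the cleanup above, using a hypothetical statement
# description: the junk words 'freedom', 'wire' and 'llc' are stripped, leading
# zeros are removed by ZERO_RE, '/' becomes a space and tokens shorter than
# three characters are dropped:
#
#     remove_payee_junk('0000123456 FREEDOM WIRE/TRANSFER MYVENDOR LLC')
#     # -> '123456 TRANSFER MYVENDOR'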


def parse_amount(amount: str) -> decimal.Decimal:
    """Parse amounts and handle comma separators as seen in some FR statements."""
    return decimal.Decimal(amount.replace('$', '').replace(',', ''))
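# For example, with a First Republic-style formatted amount (hypothetical
# value):
#
#     parse_amount('$1,234.56')  # -> Decimal('1234.56')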


def validate_amex_csv(sample: str) -> None:
    required_cols = {'Date', 'Amount', 'Description', 'Card Member'}
    reader = csv.DictReader(io.StringIO(sample))
    if reader.fieldnames and not required_cols.issubset(reader.fieldnames):
        sys.exit(f"This AMEX CSV doesn't seem to have the columns we're expecting, including: {', '.join(required_cols)}. Please use an unmodified statement direct from the institution.")


def standardize_amex_record(row: Dict, line: int) -> Dict:
    """Turn an AMEX CSV row into a standard dict format representing a transaction."""
    # NOTE: Statement doesn't seem to give us a running balance or a final total.
    return {
        'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
        'amount': -1 * parse_amount(row['Amount']),
        # Descriptions have too much noise, so taking just the start
        # significantly assists the fuzzy matching.
        'payee': remove_payee_junk(row['Description'] or '')[:20],
        'check_id': '',
        'line': line,
    }
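# A sketch of the standard transaction dict this produces, for a hypothetical
# AMEX row. The -1 multiplier flips the statement's sign convention to match
# the books' liability postings:
#
#     standardize_amex_record({'Date': '09/01/2021', 'Amount': '42.00',
#                              'Description': 'MYVENDOR PAYMENT'}, 2)
#     # -> {'date': datetime.date(2021, 9, 1), 'amount': Decimal('-42.00'),
#     #     'payee': 'MYVENDOR', 'check_id': '', 'line': 2}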


def read_amex_csv(f: TextIO) -> list:
    reader = csv.DictReader(f)
    # The reader.line_num is the source line number, not the spreadsheet row
    # number due to multi-line records.
    return sort_records([standardize_amex_record(row, i) for i, row in enumerate(reader, 2)])


def validate_fr_csv(sample: str) -> None:
    # No column headers in FR statements
    reader = csv.reader(io.StringIO(sample))
    next(reader)  # First row is previous statement ending balance
    row = next(reader)
    date = None
    try:
        date = datetime.datetime.strptime(row[1], '%m/%d/%Y')
    except ValueError:
        pass
    amount_found = '$' in row[4] and '$' in row[5]
    if len(row) != 6 or not date or not amount_found:
        sys.exit("This First Republic CSV doesn't seem to have the 6 columns we're expecting, including a date in column 2 and an amount in columns 5 and 6. Please use an unmodified statement direct from the institution.")


def standardize_fr_record(line: int, row: List) -> Dict:
    return {
        'date': datetime.datetime.strptime(row[1], '%m/%d/%Y').date(),
        'amount': parse_amount(row[4]),
        'payee': remove_payee_junk(row[3] or '')[:20],
        'check_id': row[2].replace('CHECK ', '') if 'CHECK ' in row[2] else '',
        'line': line,
    }


def read_fr_csv(f: TextIO) -> list:
    reader = csv.reader(f)
    # The reader.line_num is the source line number, not the spreadsheet row
    # number due to multi-line records.
    return sort_records(
        standardize_fr_record(i, row) for i, row in enumerate(reader, 1)
        if len(row) == 6 and row[2] not in {'LAST STATEMENT', 'THIS STATEMENT'}
    )


def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
    """Turn a Beancount query result row into a standard dict representing a transaction."""
    return {
        'date': row.date,
        'amount': row.number_cost_position,
        'payee': remove_payee_junk(f'{row.payee or ""} {row.entity or ""} {row.narration or ""}'),
        'check_id': str(row.check_id or ''),
        'filename': row.filename,
        'line': row.line,
        'bank_statement': row.bank_statement,
    }


def format_record(record: dict) -> str:
    """Generate an output line for a standard 1:1 match."""
    if record['payee'] and record['check_id']:
        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
    elif record['payee']:
        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59)
    else:
        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59)
    return output


def format_multirecord(r1s: List[dict], r2s: List[dict], note: str) -> List[list]:
    """Generate output lines for a one statement to multiple books transaction match."""
    assert len(r1s) == 1
    assert len(r2s) > 1
    match_output = []
    match_output.append([r1s[0]['date'], f'{format_record(r1s[0])} → {format_record(r2s[0])} ✓ Matched{note}'])
    for r2 in r2s[1:]:
        match_output.append([r1s[0]['date'], f'{r1s[0]["date"].isoformat()}: ↳ → {format_record(r2)} ✓ Matched{note}'])
    return match_output


def _start_of_month(time, offset_months=0):
    if offset_months > 0:
        return _start_of_month(time.replace(day=28) + datetime.timedelta(days=4), offset_months - 1)
    else:
        return time.replace(day=1)


def round_to_month(begin_date, end_date):
    """Round a beginning and end date to beginning and end of months respectively."""
    return (
        _start_of_month(begin_date),
        _start_of_month(end_date, offset_months=1) - datetime.timedelta(days=1))
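# A worked example with hypothetical dates: the day-28-plus-4-days trick above
# always lands in the following month regardless of month length, so rounding
# (2021-09-10, 2021-09-28) yields the full containing month:
#
#     round_to_month(datetime.date(2021, 9, 10), datetime.date(2021, 9, 28))
#     # -> (datetime.date(2021, 9, 1), datetime.date(2021, 9, 30))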


def sort_records(records: List) -> List:
    return sorted(records, key=lambda x: (x['date'], x['amount']))


def first_word_exact_match(a: str, b: str) -> float:
    """Score a payee match based on the first word.

    We get a whole lot of good matches this way. Helps in the
    situation where the first word or two of a transaction description
    is useful and the rest is garbage.

    """
    if len(a) == 0 or len(b) == 0:
        return 0.0
    first_a = a.split()[0].strip()
    first_b = b.split()[0].strip()
    if first_a.casefold() == first_b.casefold():
        return min(1.0, 0.2 * len(first_a))
    else:
        return 0.0
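# Worked examples with hypothetical payees: the score is 0.2 per character of
# the shared first word, capped at 1.0, so longer first words count as
# stronger evidence:
#
#     first_word_exact_match('AMAZON web services', 'Amazon marketplace')
#     # -> min(1.0, 0.2 * 6) = 1.0
#     first_word_exact_match('ABC consulting', 'abc partners')
#     # -> min(1.0, 0.2 * 3) ≈ 0.6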


def payee_match(a: str, b: str) -> float:
    """Score a match between two payees."""
    fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
    first_word_match = first_word_exact_match(a, b)
    return max(fuzzy_match, first_word_match)


def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
    """Do these records represent the same transaction?"""
    date_score = date_proximity(r1['date'], r2['date'])
    if r1['date'] == r2['date']:
        date_message = ''
    elif date_score > 0.0:
        diff = abs((r1['date'] - r2['date']).days)
        date_message = f'+/- {diff} days'
    else:
        date_message = 'date mismatch'

    if r1['amount'] == r2['amount']:
        amount_score, amount_message = 2.0, ''
    else:
        amount_score, amount_message = 0.0, 'amount mismatch'

    # We never consider payee if there's a check_id in the books.
    check_message = ''
    payee_message = ''
    # Sometimes we get unrelated numbers in the statement column with check-ids,
    # so we can't match based on the existence of a statement check-id.
    if r2['check_id']:
        payee_score = 0.0
        if r1['check_id'] and r2['check_id'] and r1['check_id'] == r2['check_id']:
            check_score = 1.0
        else:
            check_message = 'check-id mismatch'
            check_score = 0.0
    else:
        check_score = 0.0
        payee_score = payee_match(r1['payee'], r2['payee'])
        if payee_score > FULL_MATCH_THRESHOLD:
            payee_message = ''
        elif payee_score > PARTIAL_MATCH_THRESHOLD:
            payee_message = 'partial payee match'
        else:
            payee_message = 'payee mismatch'

    overall_score = (date_score + amount_score + check_score + payee_score) / 4
    overall_message = [m for m in [date_message, amount_message, check_message, payee_message] if m]
    return overall_score, overall_message
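# A worked scoring example with hypothetical records and no check-id: equal
# amounts contribute 2.0, a six-day date gap contributes 1.0 - 6/60 = 0.9 (see
# date_proximity below) and a partial payee match of, say, 0.5 gives an
# overall score of (0.9 + 2.0 + 0.0 + 0.5) / 4 = 0.85 — enough to pass the
# unconditional 0.8 acceptance threshold used in match_statement_and_books.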


def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
    """Match transactions between the statement and books.

    If matched, the books transaction is marked off so that it can
    only be matched once. Some transactions will be matched, some will
    be on the statement but not the books and some on the books but
    not the statement.

    Passes through any unmatched transactions.

    Currently we use the same matching logic for all types of
    statements. It's conceivable that you could have special cases to
    accurately match some types of statements, but that would be more
    work to maintain and test.

    """
    matches = []
    remaining_books_trans = []
    remaining_statement_trans = []

    for r1 in statement_trans:
        best_match_score = 0.0
        best_match_index = None
        best_match_note = []
        matches_found = 0
        for i, r2 in enumerate(books_trans):
            score, note = records_match(r1, r2)
            if score >= 0.5 and score >= best_match_score:
                matches_found += 1
                best_match_score = score
                best_match_index = i
                best_match_note = note
        if (best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note) or best_match_score > 0.8:
            matches.append(([r1], [books_trans[best_match_index]], best_match_note))
            # Don't try to make a second match against this books entry.
            if best_match_index is not None:
                del books_trans[best_match_index]
        else:
            remaining_statement_trans.append(r1)
    for r2 in books_trans:
        remaining_books_trans.append(r2)
    return matches, remaining_statement_trans, remaining_books_trans
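# A sketch of the return shape, with hypothetical data: each match is a tuple
# of ([statement records], [books records], [notes]), and the two remainder
# lists feed into subset_match and process_unmatched below:
#
#     ([([statement_rec], [books_rec], ['+/- 2 days']), ...],
#      [unmatched_statement_rec, ...],
#      [unmatched_books_rec, ...])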


def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
    """Match single statement transactions with multiple books transactions.

    Works similarly to match_statement_and_books in that it returns a
    list of matches and lists of remaining statement and books
    transactions.

    """
    matches = []
    remaining_books_trans = []
    remaining_statement_trans = []

    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
    for _, group in groups:
        best_match_score = 0.0
        best_match_index = None
        best_match_note = []
        matches_found = 0

        group_items = list(group)
        total = sum(x['amount'] for x in group_items)
        r2 = copy.copy(group_items[0])
        r2['amount'] = total
        for i, r1 in enumerate(statement_trans):
            score, note = records_match(r1, r2)
            if score >= 0.5 and score >= best_match_score:
                matches_found += 1
                best_match_score = score
                best_match_index = i
                best_match_note = note
        if (best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note) or best_match_score > 0.8:
            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
            if best_match_index is not None:
                del statement_trans[best_match_index]
        else:
            remaining_books_trans.append(r2)
    for r1 in statement_trans:
        remaining_statement_trans.append(r1)
    return matches, remaining_statement_trans, remaining_books_trans
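# A rough illustration with hypothetical records: two books postings on the
# same date with the same payee, say 500.00 each for 'Health Insurance', are
# summed to 1000.00 and matched against a single 1000.00 statement entry,
# yielding a 1:many match that format_multirecord renders with a ↳
# continuation line.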


# TODO: Return list of tuples (instead of list of lists).

def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:
    """Produce a list of body output lines from the given matches.

    The first column is a date so we can re-sort the list to put the
    missing entries in the right place. The second column is the text
    output.

    """
    match_output = []
    for r1s, r2s, note in matches:
        note = ', '.join(note)
        note = ': ' + note if note else note
        if r1s and r2s:
            if show_reconciled_matches or not all(x['bank_statement'] for x in r2s):
                if len(r2s) == 1:
                    entry = [r1s[0]['date'], f'{format_record(r1s[0])} → {format_record(r2s[0])} ✓ Matched{note}']
                    if 'payee mismatch' in note:
                        entry[1] = Fore.YELLOW + Style.BRIGHT + entry[1] + Style.RESET_ALL
                    match_output.append(entry)
                else:
                    match_output.extend(format_multirecord(r1s, r2s, note))
        elif r1s:
            match_output.append([r1s[0]['date'], Fore.RED + Style.BRIGHT + f'{format_record(r1s[0])} → {" ":^59} ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1s[0]["line"]})' + Style.RESET_ALL])
        else:
            match_output.append([r2s[0]['date'], Fore.RED + Style.BRIGHT + f'{" ":^59} → {format_record(r2s[0])} ✗ NOT ON STATEMENT ({os.path.basename(r2s[0]["filename"])}:{r2s[0]["line"]})' + Style.RESET_ALL])
    return match_output


def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
    """Scores two days based on how close they are together."""
    ZERO_CUTOFF = 60  # Score will be zero for this many days apart.
    diff = abs(int((d1 - d2).days))
    if diff >= ZERO_CUTOFF:
        return 0.0
    else:
        return 1.0 - (diff / ZERO_CUTOFF)
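# Worked examples with hypothetical dates: the score decays linearly from 1.0
# for the same day down to 0.0 at 60 days apart, so six days apart scores
# 1.0 - 6/60 = 0.9:
#
#     date_proximity(datetime.date(2021, 9, 1), datetime.date(2021, 9, 7))   # -> 0.9
#     date_proximity(datetime.date(2021, 9, 1), datetime.date(2021, 11, 1))  # -> 0.0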


def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
    """Returns the bank-statement metadata that should be applied for a match."""
    # TODO: Our data structure would allow multiple statement entries
    # for a match, but would this ever make sense? Probably not.
    statement_filename = get_repo_relative_path(statement_filename)
    csv_filename = get_repo_relative_path(csv_filename)
    metadata = []
    statement_entries, books_entries, _ = match
    for books_entry in books_entries:
        for statement_entry in statement_entries:
            if not books_entry['bank_statement']:
                metadata.append((books_entry['filename'], books_entry['line'], f' bank-statement: "{statement_filename}"'))
                metadata.append((books_entry['filename'], books_entry['line'], f' bank-statement-csv: "{csv_filename}:{statement_entry["line"]}"'))
    return metadata


def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
    """Insert reconciliation metadata in the books files.

    Takes a list of edits to make as tuples of form (filename, lineno, metadata):

    [
        ('2021/main.beancount', 4245, ' bank-statement: statement.pdf'),
        ('2021/main.beancount', 1057, ' bank-statement: statement.pdf'),
        ('2021/payroll.beancount', 257, ' bank-statement: statement.pdf'),
        ...,
    ]

    Beancount doesn't provide any infrastructure for programmatically
    updating the books, only appending in the case of importers. So
    we're on our own here.

    """
    file_contents: Dict[str, List] = {}
    file_offsets: Dict[str, int] = collections.defaultdict(int)
    # Load each books file into memory and insert the relevant metadata lines.
    # Line numbers change as we do this, so we keep track of the offset for each
    # file. Changes must be sorted by line number first or else the offsets will
    # break because we're jumping around making edits.
    for filename, line, metadata in sorted(metadata_to_apply):
        if filename not in file_contents:
            with open(filename, 'r') as f:
                file_contents[filename] = f.readlines()
        # Insert is inefficient, but fast enough for now in practice.
        file_contents[filename].insert(line + file_offsets[filename], metadata.rstrip() + '\n')
        file_offsets[filename] += 1
    # Write each updated file back to disk.
    for filename, contents in file_contents.items():
        with open(filename, 'w') as f:
            f.writelines(contents)
        print(f'Wrote {filename}.')
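# A worked example of the offset bookkeeping above, with hypothetical edits:
# two inserts into the same file at lines 10 and 20 are applied in sorted
# order. The first goes in at index 10 and bumps the file's offset to 1, so
# the second lands at index 20 + 1 = 21, keeping it aligned with the line
# numbers from the original, unedited file.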


def get_repo_relative_path(path: str) -> str:
    """Chop off the unique per-person CONSERVANCY_REPOSITORY.

    CSV and PDF statement metadata should be relative to
    CONSERVANCY_REPOSITORY, i.e. without regard to exactly where on
    your computer all the files live.

    """
    return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))


def parse_path(path: str) -> str:
    """Validate that a file exists for use in argparse."""
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    return path


def parse_repo_relative_path(path: str) -> str:
    """Validate that a file exists and is within $CONSERVANCY_REPOSITORY.

    For use with argparse.

    """
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    repo = os.getenv('CONSERVANCY_REPOSITORY')
    if not repo:
        raise argparse.ArgumentTypeError('$CONSERVANCY_REPOSITORY is not set.')
    if not path.startswith(repo):
        raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.')
    return path


def parse_decimal_with_separator(number_text: str) -> decimal.Decimal:
    """decimal.Decimal can't parse numbers with a thousands separator."""
    number_text = number_text.replace(',', '')
    return decimal.Decimal(number_text)


def parse_arguments(argv: Optional[Sequence[str]]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog=PROGNAME, description='Reconciliation helper')
    cliutil.add_version_argument(parser)
    cliutil.add_loglevel_argument(parser)
    parser.add_argument('--beancount-file', required=True, type=parse_path)
    parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--bank-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--account', required=True, help='eg. Liabilities:CreditCard:AMEX')
    # parser.add_argument('--report-group-regex')
    parser.add_argument('--show-reconciled-matches', action='store_true')
    parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")
    # parser.add_argument('--statement-balance', type=parse_decimal_with_separator, required=True, help="A.K.A \"cleared balance\" taken from the end of the period on the PDF statement. Required because CSV statements don't include final or running totals")
    parser.add_argument('--full-months', action='store_true', help='Match payments over the full month, rather than just between the beginning and end dates of the CSV statement')
    args = parser.parse_args(args=argv)
    return args


def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
    """Calculate the totals of transactions matched/not-matched."""
    total_matched = decimal.Decimal(0)
    total_missing_from_books = decimal.Decimal(0)
    total_missing_from_statement = decimal.Decimal(0)
    for statement_entries, books_entries, _ in matches:
        if statement_entries and books_entries:
            total_matched += sum(c['amount'] for c in statement_entries)
        elif statement_entries:
            total_missing_from_books += sum(c['amount'] for c in statement_entries)
        else:
            total_missing_from_statement += sum(c['amount'] for c in books_entries)
    return total_matched, total_missing_from_books, total_missing_from_statement


def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
    """Format the remaining unmatched transactions to be added to one single list of matches."""
    matches: List[Tuple[List, List, List]] = []
    for r1 in statement_trans:
        matches.append(([r1], [], ['no match']))
    for r2 in books_trans:
        matches.append(([], [r2], ['no match']))
    return matches


def format_output(matches, begin_date, end_date, csv_statement, show_reconciled_matches) -> str:
    with io.StringIO() as out:
        match_output = format_matches(matches, csv_statement, show_reconciled_matches)
        _, total_missing_from_books, total_missing_from_statement = totals(matches)
        print('-' * 155, file=out)
        statement_heading = f'Statement transactions {begin_date} to {end_date}'
        print(f'{statement_heading:<52} {"Books transactions":<58} Notes', file=out)
        print('-' * 155, file=out)
        for _, output in sorted(match_output, key=lambda x: x[0]):
            print(output, file=out)
        print('-' * 155, file=out)
        print(f'Sub-total not on statement: {total_missing_from_statement:12,.2f}', file=out)
        print(f'Sub-total not in books:     {total_missing_from_books:12,.2f}', file=out)
        print(f'Total:                      {total_missing_from_statement + total_missing_from_books:12,.2f}', file=out)
        print('-' * 155, file=out)
        return out.getvalue()


def main(arglist: Optional[Sequence[str]] = None,
         stdout: TextIO = sys.stdout,
         stderr: TextIO = sys.stderr,
         config: Optional[configmod.Config] = None,
         ) -> int:
    args = parse_arguments(arglist)
    cliutil.set_loglevel(logger, args.loglevel)
    if config is None:
        config = configmod.Config()
        config.load_file()

    # Validate and normalise the statement into our standard
    # transaction data structure.
    if 'AMEX' in args.account:
        validate_csv = validate_amex_csv
        read_csv = read_amex_csv
    else:
        validate_csv = validate_fr_csv
        read_csv = read_fr_csv

    with open(args.csv_statement) as f:
        sample = f.read(200)
        # TODO: validate_csv should return true/false and a message.
        validate_csv(sample)
        f.seek(0)
        # TODO: Needs a custom read_transactions_from_csv for each of AMEX and
        # FR since AMEX has a header row and FR doesn't.
        statement_trans = read_csv(f)

    # Dates are taken from the beginning/end of the statement.
    begin_date = statement_trans[0]['date']
    end_date = statement_trans[-1]['date']

    if args.full_months:
        begin_date, end_date = round_to_month(begin_date, end_date)

    # Query the Beancount books for the above period.
    #
    # There are pros and cons both for using Beancount's in-memory entries
    # list directly and for using the Beancount Query Language (BQL) to get a
    # list of transactions. We're using BQL because it's convenient, but it
    # means we don't have access to the full transaction entry objects. Feels
    # a bit strange that these approaches are so disconnected.
    #
    # beancount.query.query_compile.compile() and
    # beancount.query.query_execute.filter_entries() look useful in this respect,
    # but I'm not clear on how to use compile(). An example would help.
    entries, _, options = loader.load_file(args.beancount_file)
    # String concatenation looks bad, but there's no SQL injection possible here
    # because BQL can't write back to the Beancount files. I hope!
    query = f"""
SELECT filename,
       META("lineno") AS line,
       META("bank-statement") AS bank_statement,
       date,
       number(cost(position)),
       payee,
       ENTRY_META("entity") as entity,
       ANY_META("check-id") as check_id,
       narration
WHERE account = "{args.account}"
  AND date >= {begin_date}
  AND date <= {end_date}"""
    _, result_rows = run_query(entries, options, query)
    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])

    # Apply two passes of matching: one for standard 1:1 matches and one
    # for subset (1:many) matches.
    matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
        remaining_statement_trans, remaining_books_trans)
    matches.extend(subset_matches)

    # Add the remaining unmatched to make one big list of matches, successful or not.
    unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
    matches.extend(unmatched)

    # Print out the results of our matching.
    print(format_output(matches, begin_date, end_date, args.csv_statement, args.show_reconciled_matches))

    # Write statement metadata back to the books.
    metadata_to_apply = []
    for match in matches:
        metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))
    if metadata_to_apply and not args.non_interactive:
        print('Mark matched transactions as reconciled in the books? (y/N) ', end='')
        if input().lower() == 'y':
            write_metadata_to_books(metadata_to_apply)
    return 0


entry_point = cliutil.make_entry_point(__name__, PROGNAME)

if __name__ == '__main__':
    exit(entry_point())