reconcile: Add further typing info; update tests.

This commit is contained in:
Ben Sturmfels 2022-02-24 22:43:37 +11:00
parent 9ae36feed2
commit 54d11f2437
Signed by: bsturmfels
GPG key ID: 023C05E2C9C068F0
2 changed files with 75 additions and 41 deletions

View file

@ -58,7 +58,7 @@ from typing import Callable, Dict, List, Tuple, TextIO
from beancount import loader from beancount import loader
from beancount.query.query import run_query from beancount.query.query import run_query
from colorama import Fore, Style from colorama import Fore, Style # type: ignore
if not sys.warnoptions: if not sys.warnoptions:
import warnings import warnings
@ -96,7 +96,7 @@ JUNK_WORDS = [
JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS] JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
ZERO_RE = re.compile('^0+') ZERO_RE = re.compile('^0+')
def remove_duplicate_words(text): def remove_duplicate_words(text: str) -> str:
unique_words = [] unique_words = []
known_words = set() known_words = set()
for word in text.split(): for word in text.split():
@ -123,18 +123,19 @@ def remove_payee_junk(payee: str) -> str:
payee.strip() payee.strip()
return payee return payee
# NOTE: Statement doesn't seem to give us a running balance or a final total.
def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable) -> list: def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable) -> list:
reader = csv.DictReader(f) reader = csv.DictReader(f)
return sort_records([standardize_statement_record(row, reader.line_num) for row in reader]) # The reader.line_num is the source line number, not the spreadsheet row
# number due to multi-line records.
return sort_records([standardize_statement_record(row, i) for i, row in enumerate(reader, 2)])
# Does the account you entered match the CSV?
# Is the CSV in the format we expect? (ie. did they download through the right interface?) # NOTE: Statement doesn't seem to give us a running balance or a final total.
# Logical CSV line numbers # CSV reconciliation report.
# CSV reconciliation report
# Merge helper script. # Merge helper script.
def standardize_amex_record(row: Dict, line: int) -> Dict: def standardize_amex_record(row: Dict, line: int) -> Dict:
"""Turn an AMEX CSV row into a standard dict format representing a transaction.""" """Turn an AMEX CSV row into a standard dict format representing a transaction."""
return { return {
@ -148,6 +149,20 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
} }
def validate_amex_csv(sample: str, account: str) -> None:
required_cols = {'Date', 'Amount', 'Description', 'Card Member'}
reader = csv.DictReader(io.StringIO(sample))
if reader.fieldnames and not required_cols.issubset(reader.fieldnames):
sys.exit(f"This CSV doesn't seem to have the columns we're expecting, including: {', '.join(required_cols)}")
def validate_fr_csv(sample: str, account: str) -> None:
required_cols = {'Date', 'Amount', 'Detail', 'Serial Num'}
reader = csv.DictReader(io.StringIO(sample))
if reader.fieldnames and not required_cols.issubset(reader.fieldnames):
sys.exit(f"This CSV doesn't seem to have the columns we're expecting, including: {', '.join(required_cols)}")
def standardize_fr_record(row: Dict, line: int) -> Dict: def standardize_fr_record(row: Dict, line: int) -> Dict:
return { return {
'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(), 'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
@ -181,7 +196,7 @@ def format_record(record: dict) -> str:
return output return output
def format_multirecord(r1s, r2s, note): def format_multirecord(r1s: list[dict], r2s: list[dict], note: str) -> list[list]:
total = sum(x['amount'] for x in r2s) total = sum(x['amount'] for x in r2s)
assert len(r1s) == 1 assert len(r1s) == 1
assert len(r2s) > 1 assert len(r2s) > 1
@ -191,26 +206,29 @@ def format_multirecord(r1s, r2s, note):
match_output.append([r1s[0]['date'], f'{r1s[0]["date"].isoformat()}: ↳ → {format_record(r2)} ✓ Matched{note}']) match_output.append([r1s[0]['date'], f'{r1s[0]["date"].isoformat()}: ↳ → {format_record(r2)} ✓ Matched{note}'])
return match_output return match_output
def sort_records(records: List) -> List: def sort_records(records: List) -> List:
return sorted(records, key=lambda x: (x['date'], x['amount'])) return sorted(records, key=lambda x: (x['date'], x['amount']))
def first_word_exact_match(a, b): def first_word_exact_match(a: str, b: str) -> float:
if len(a) == 0 or len(b) == 0: if len(a) == 0 or len(b) == 0:
return 0 return 0.0
first_a = a.split()[0].strip() first_a = a.split()[0].strip()
first_b = b.split()[0].strip() first_b = b.split()[0].strip()
if first_a.casefold() == first_b.casefold(): if first_a.casefold() == first_b.casefold():
return min(1.0, 0.2 * len(first_a)) return min(1.0, 0.2 * len(first_a))
else: else:
return 0; return 0.0;
def payee_match(a, b):
fuzzy_match = fuzz.token_set_ratio(a, b) / 100.00 def payee_match(a: str, b: str) -> float:
fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
first_word_match = first_word_exact_match(a, b) first_word_match = first_word_exact_match(a, b)
return max(fuzzy_match, first_word_match) return max(fuzzy_match, first_word_match)
def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
"""Do these records represent the same transaction?""" """Do these records represent the same transaction?"""
date_score = date_proximity(r1['date'], r2['date']) date_score = date_proximity(r1['date'], r2['date'])
@ -254,7 +272,7 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
return overall_score, overall_message return overall_score, overall_message
def match_statement_and_books(statement_trans: list, books_trans: list): def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
""" """
Runs through all the statement transactions to find a matching transaction Runs through all the statement transactions to find a matching transaction
in the books. If found, the books transaction is marked off so that it can in the books. If found, the books transaction is marked off so that it can
@ -266,9 +284,9 @@ def match_statement_and_books(statement_trans: list, books_trans: list):
remaining_statement_trans = [] remaining_statement_trans = []
for r1 in statement_trans: for r1 in statement_trans:
best_match_score = 0 best_match_score = 0.0
best_match_index = None best_match_index = None
best_match_note = '' best_match_note = []
matches_found = 0 matches_found = 0
for i, r2 in enumerate(books_trans): for i, r2 in enumerate(books_trans):
score, note = records_match(r1, r2) score, note = records_match(r1, r2)
@ -280,7 +298,8 @@ def match_statement_and_books(statement_trans: list, books_trans: list):
if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8: if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
matches.append(([r1], [books_trans[best_match_index]], best_match_note)) matches.append(([r1], [books_trans[best_match_index]], best_match_note))
# Don't try to make a second match against this books entry. # Don't try to make a second match against this books entry.
del books_trans[best_match_index] if best_match_index is not None:
del books_trans[best_match_index]
else: else:
remaining_statement_trans.append(r1) remaining_statement_trans.append(r1)
for r2 in books_trans: for r2 in books_trans:
@ -288,7 +307,9 @@ def match_statement_and_books(statement_trans: list, books_trans: list):
return matches, remaining_statement_trans, remaining_books_trans return matches, remaining_statement_trans, remaining_books_trans
def format_matches(matches, csv_statement: str, show_reconciled_matches): # TODO: Return list of tuples (instead of list of lists).
def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:
match_output = [] match_output = []
for r1s, r2s, note in matches: for r1s, r2s, note in matches:
note = ', '.join(note) note = ', '.join(note)
@ -306,14 +327,15 @@ def format_matches(matches, csv_statement: str, show_reconciled_matches):
return match_output return match_output
def date_proximity(d1, d2): def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
diff = abs((d1 - d2).days) diff = abs(int((d1 - d2).days))
if diff > 60: if diff > 60:
return 0 return 0.0
else: else:
return 1.0 - (diff / 60.0) return 1.0 - (diff / 60.0)
def metadata_for_match(match, statement_filename, csv_filename):
def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
# Can we really ever have multiple statement entries? Probably not. # Can we really ever have multiple statement entries? Probably not.
statement_filename = get_repo_relative_path(statement_filename) statement_filename = get_repo_relative_path(statement_filename)
csv_filename = get_repo_relative_path(csv_filename) csv_filename = get_repo_relative_path(csv_filename)
@ -361,15 +383,18 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No
f.writelines(contents) f.writelines(contents)
print(f'Wrote {filename}.') print(f'Wrote {filename}.')
def get_repo_relative_path(path):
def get_repo_relative_path(path: str) -> str:
return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY')) return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))
def parse_path(path):
def parse_path(path: str) -> str:
if not os.path.exists(path): if not os.path.exists(path):
raise argparse.ArgumentTypeError(f'File {path} does not exist.') raise argparse.ArgumentTypeError(f'File {path} does not exist.')
return path return path
def parse_repo_relative_path(path):
def parse_repo_relative_path(path: str) -> str:
if not os.path.exists(path): if not os.path.exists(path):
raise argparse.ArgumentTypeError(f'File {path} does not exist.') raise argparse.ArgumentTypeError(f'File {path} does not exist.')
repo = os.getenv('CONSERVANCY_REPOSITORY') repo = os.getenv('CONSERVANCY_REPOSITORY')
@ -379,7 +404,8 @@ def parse_repo_relative_path(path):
raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.') raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.')
return path return path
def parse_args(argv):
def parse_args(argv: List[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Reconciliation helper') parser = argparse.ArgumentParser(description='Reconciliation helper')
parser.add_argument('--beancount-file', required=True, type=parse_path) parser.add_argument('--beancount-file', required=True, type=parse_path)
parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path) parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path)
@ -392,7 +418,8 @@ def parse_args(argv):
parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books") parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")
return parser.parse_args(args=argv[1:]) return parser.parse_args(args=argv[1:])
def totals(matches):
def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
total_matched = decimal.Decimal(0) total_matched = decimal.Decimal(0)
total_missing_from_books = decimal.Decimal(0) total_missing_from_books = decimal.Decimal(0)
total_missing_from_statement = decimal.Decimal(0) total_missing_from_statement = decimal.Decimal(0)
@ -406,16 +433,16 @@ def totals(matches):
return total_matched, total_missing_from_books, total_missing_from_statement return total_matched, total_missing_from_books, total_missing_from_statement
def subset_match(statement_trans, books_trans): def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
matches = [] matches = []
remaining_books_trans = [] remaining_books_trans = []
remaining_statement_trans = [] remaining_statement_trans = []
groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee'])) groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
for k, group in groups: for k, group in groups:
best_match_score = 0 best_match_score = 0.0
best_match_index = None best_match_index = None
best_match_note = '' best_match_note = []
matches_found = 0 matches_found = 0
group_items = list(group) group_items = list(group)
@ -430,11 +457,11 @@ def subset_match(statement_trans, books_trans):
best_match_index = i best_match_index = i
best_match_note = note best_match_note = note
if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8: if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
if best_match_score <= 0.8:
best_match_note.append('only one decent match')
matches.append(([statement_trans[best_match_index]], group_items, best_match_note)) matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
del statement_trans[best_match_index] if best_match_index is not None:
del statement_trans[best_match_index]
for item in group_items: for item in group_items:
# TODO: Why?
books_trans.remove(item) books_trans.remove(item)
else: else:
remaining_books_trans.append(r2) remaining_books_trans.append(r2)
@ -442,26 +469,33 @@ def subset_match(statement_trans, books_trans):
remaining_statement_trans.append(r1) remaining_statement_trans.append(r1)
return matches, remaining_statement_trans, remaining_books_trans return matches, remaining_statement_trans, remaining_books_trans
def process_unmatched(statement_trans, books_trans):
matches = [] def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
matches: List[Tuple[List, List, List]] = []
for r1 in statement_trans: for r1 in statement_trans:
matches.append(([r1], [], ['no match'])) matches.append(([r1], [], ['no match']))
for r2 in books_trans: for r2 in books_trans:
matches.append(([], [r2], ['no match'])) matches.append(([], [r2], ['no match']))
return matches return matches
def main(args):
def main(args: argparse.Namespace) -> None:
# TODO: Should put in a sanity check to make sure the statement you're feeding # TODO: Should put in a sanity check to make sure the statement you're feeding
# in matches the account you've provided. # in matches the account you've provided.
# TODO: Can we open the files first, then pass the streams on to the rest of the program? # TODO: Can we open the files first, then pass the streams on to the rest of the program?
if 'AMEX' in args.account: if 'AMEX' in args.account:
validate_csv = validate_amex_csv
standardize_statement_record = standardize_amex_record standardize_statement_record = standardize_amex_record
else: else:
validate_csv = validate_fr_csv
standardize_statement_record = standardize_fr_record standardize_statement_record = standardize_fr_record
with open(args.csv_statement) as f: with open(args.csv_statement) as f:
sample = f.read(200)
validate_csv(sample, args.account)
f.seek(0)
statement_trans = read_transactions_from_csv(f, standardize_statement_record) statement_trans = read_transactions_from_csv(f, standardize_statement_record)
begin_date = statement_trans[0]['date'] begin_date = statement_trans[0]['date']

View file

@ -218,7 +218,7 @@ def test_payee_mismatch_ok_when_only_one_that_amount_and_date():
statement = [S3] statement = [S3]
books = [B3_payee_mismatch_1] books = [B3_payee_mismatch_1]
assert match_statement_and_books(statement, books) == ( assert match_statement_and_books(statement, books) == (
[([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match'])], [([S3], [B3_payee_mismatch_1], ['payee mismatch'])],
[], [],
[], [],
) )
@ -255,8 +255,8 @@ def test_payee_matches_when_first_word_matches():
def test_metadata_for_match(monkeypatch): def test_metadata_for_match(monkeypatch):
monkeypatch.setenv('CONSERVANCY_REPOSITORY', '.') monkeypatch.setenv('CONSERVANCY_REPOSITORY', '.')
assert metadata_for_match(([S1], [B1], []), 'statement.pdf', 'statement.csv') == [ assert metadata_for_match(([S1], [B1], []), 'statement.pdf', 'statement.csv') == [
('2022/imports.beancount', 777, ' bank-statement: statement.pdf'), ('2022/imports.beancount', 777, ' bank-statement: "statement.pdf"'),
('2022/imports.beancount', 777, ' bank-statement-csv: statement.csv:222'), ('2022/imports.beancount', 777, ' bank-statement-csv: "statement.csv:222"'),
] ]
def test_no_metadata_if_no_matches(): def test_no_metadata_if_no_matches():