reconciler: Move other score thresholds to constants

This commit is contained in:
Ben Sturmfels 2023-02-11 18:29:52 +11:00
parent baa299c4c5
commit 86f4232df1
Signed by: bsturmfels
GPG key ID: 023C05E2C9C068F0

View file

@ -152,8 +152,10 @@ JUNK_WORDS = [
] ]
JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS] JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
ZERO_RE = re.compile('^0+') ZERO_RE = re.compile('^0+')
FULL_MATCH_THRESHOLD = 0.8 PAYEE_FULL_MATCH_THRESHOLD = 0.8
PARTIAL_MATCH_THRESHOLD = 0.4 PAYEE_PARTIAL_MATCH_THRESHOLD = 0.4
OVERALL_EXCELLENT_MATCH_THRESHOLD = 0.8 # Clear winner
OVERALL_ACCEPTABLE_MATCH_THRESHOLD = 0.5 # Acceptable if only one match found
def remove_duplicate_words(text: str) -> str: def remove_duplicate_words(text: str) -> str:
@ -392,9 +394,9 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
else: else:
check_score = 0.0 check_score = 0.0
payee_score = payee_match(r1['payee'], r2['payee']) payee_score = payee_match(r1['payee'], r2['payee'])
if payee_score > FULL_MATCH_THRESHOLD: if payee_score > PAYEE_FULL_MATCH_THRESHOLD:
payee_message = '' payee_message = ''
elif payee_score > PARTIAL_MATCH_THRESHOLD: elif payee_score > PAYEE_PARTIAL_MATCH_THRESHOLD:
payee_message = 'partial payee match' payee_message = 'partial payee match'
else: else:
payee_message = 'payee mismatch' payee_message = 'payee mismatch'
@ -435,16 +437,16 @@ def match_statement_and_books(
matches_found = 0 matches_found = 0
for i, r2 in enumerate(books_trans): for i, r2 in enumerate(books_trans):
score, note = records_match(r1, r2) score, note = records_match(r1, r2)
if score >= 0.5 and score >= best_match_score: if score >= OVERALL_ACCEPTABLE_MATCH_THRESHOLD and score >= best_match_score:
matches_found += 1 matches_found += 1
best_match_score = score best_match_score = score
best_match_index = i best_match_index = i
best_match_note = note best_match_note = note
if ( if (
best_match_score > 0.5 best_match_score > OVERALL_ACCEPTABLE_MATCH_THRESHOLD
and matches_found == 1 and matches_found == 1
and 'check-id mismatch' not in best_match_note and 'check-id mismatch' not in best_match_note
or best_match_score > 0.8 or best_match_score > OVERALL_EXCELLENT_MATCH_THRESHOLD
): ):
matches.append(([r1], [books_trans[best_match_index]], best_match_note)) matches.append(([r1], [books_trans[best_match_index]], best_match_note))
# Don't try to make a second match against this books entry. # Don't try to make a second match against this books entry.
@ -484,16 +486,16 @@ def subset_match(
r2['amount'] = total r2['amount'] = total
for i, r1 in enumerate(statement_trans): for i, r1 in enumerate(statement_trans):
score, note = records_match(r1, r2) score, note = records_match(r1, r2)
if score >= 0.5 and score >= best_match_score: if score >= OVERALL_ACCEPTABLE_MATCH_THRESHOLD and score >= best_match_score:
matches_found += 1 matches_found += 1
best_match_score = score best_match_score = score
best_match_index = i best_match_index = i
best_match_note = note best_match_note = note
if ( if (
best_match_score > 0.5 best_match_score > OVERALL_ACCEPTABLE_MATCH_THRESHOLD
and matches_found == 1 and matches_found == 1
and 'check-id mismatch' not in best_match_note and 'check-id mismatch' not in best_match_note
or best_match_score > 0.8 or best_match_score > OVERALL_EXCELLENT_MATCH_THRESHOLD
): ):
matches.append( matches.append(
([statement_trans[best_match_index]], group_items, best_match_note) ([statement_trans[best_match_index]], group_items, best_match_note)
@ -795,6 +797,8 @@ def main(
statement_trans = read_csv(f) statement_trans = read_csv(f)
# Dates are taken from the beginning/end of the statement. # Dates are taken from the beginning/end of the statement.
# TODO: FR statements include the last day of the previous statement in the
# first row and the last day of this statement in the last row.
begin_date = statement_trans[0]['date'] begin_date = statement_trans[0]['date']
end_date = statement_trans[-1]['date'] end_date = statement_trans[-1]['date']