100 lines
3 KiB
Python
100 lines
3 KiB
Python
import re
|
|
import unicodedata
|
|
|
|
from . import HOOK_KINDS
|
|
|
|
class AddEntityHook:
    """Derive hyphen-joined entity identifiers from names in entry data.

    ``run`` fills ``data['entity']`` from ``data['payee']`` and
    ``data['corp_entity']`` from ``data['corporation']`` whenever the source
    key is present and the target key has not been set already.
    """

    # Hook kind tag supplied by the enclosing package.
    # NOTE(review): presumably read by the hook runner to decide when this
    # hook executes -- confirm against the package.
    KIND = HOOK_KINDS.DATA_MUNGER

    # Words (compared lowercased) that are kept attached to the last name
    # when it is moved to the front, e.g. "Jan van Smith" -> van-Smith-Jan.
    NAME_PREFIXES = frozenset([
        'da',
        'de',
        'der',
        'la',
        'van',
    ])

    # Trailing words (compared lowercased) dropped from corporation names.
    COMPANY_SUFFIXES = frozenset([
        'co',
        'company',
        'corp',
        'corporation',
        'inc',
        'incorporated',
        'llc',
    ])

    # Matches every character that is not a hyphen, ASCII letter, or digit.
    NONASCII_RE = re.compile(r'[^-A-Za-z0-9]')
    # Matches every character that is not a hyphen or a regex word character.
    NONALNUM_RE = re.compile(r'[^-\w]')

    # Regex-escaped bracket characters; OPEN_PARENS[i] pairs with
    # CLOSE_PARENS[i].
    OPEN_PARENS = ['\\(', '\\[', '\\{']
    CLOSE_PARENS = ['\\)', '\\]', '\\}']
    # Regex fragment matching a (possibly empty) run of non-bracket characters.
    NO_PARENS = '[^{}]*'.format(''.join(OPEN_PARENS + CLOSE_PARENS))

    def __init__(self, config):
        """Accept the hook configuration; this hook reads nothing from it."""
        pass

    def _remove_parens(self, s):
        """Return ``s`` with bracketed asides removed.

        Innermost ``(...)``, ``[...]`` and ``{...}`` groups are deleted
        repeatedly until the string stops changing, so nested groups are
        removed as well.  If a pass would delete everything, the string as it
        stood before that pass is returned instead, so a name that consists
        only of bracketed text is not reduced to an empty string.
        """
        patterns = [
            open_c + self.NO_PARENS + close_c
            for open_c, close_c in zip(self.OPEN_PARENS, self.CLOSE_PARENS)
        ]
        while s:
            stripped = s
            for pattern in patterns:
                stripped = re.sub(pattern, '', stripped)
            # Stop at a fixed point, or when this pass emptied the string; in
            # the latter case keep the last non-empty value as the result.
            if not stripped or stripped == s:
                break
            s = stripped
        return s

    def _destroke_chr(self, c):
        """Return the unstroked base of a "... WITH STROKE" character.

        Characters whose Unicode name does not end in " WITH STROKE", and
        characters for which no base character exists under the truncated
        name, are returned unchanged.
        """
        suffix = ' WITH STROKE'
        name = unicodedata.name(c, '')
        if not name.endswith(suffix):
            return c
        try:
            return unicodedata.lookup(name[:-len(suffix)])
        except KeyError:
            # unicodedata.lookup raises KeyError for unknown names; keep the
            # original character rather than aborting the whole conversion.
            return c

    def _destroke(self, s):
        """Apply ``_destroke_chr`` to every character of ``s``."""
        return ''.join(self._destroke_chr(c) for c in s)

    def _entity_parts(self, s, trim_re):
        """Yield the cleaned, non-empty whitespace-separated words of ``s``.

        Each word is NFKD-normalized and then every match of ``trim_re`` is
        removed from it; words that end up empty are skipped.
        """
        for word in s.split():
            word = unicodedata.normalize('NFKD', word)
            word = trim_re.sub('', word)
            if word:
                yield word

    def _move_last_name(self, parts):
        """Return ``parts`` with the last name rotated to the front.

        The last name is the final word plus any run of ``NAME_PREFIXES``
        words immediately before it.  When that run reaches the start of the
        list (including lists with fewer than two words) there is nothing to
        rotate and ``parts`` is returned as is.
        """
        # Index where the last name starts; walk left over name prefixes.
        start = len(parts) - 1
        while start > 0 and parts[start - 1].lower() in self.NAME_PREFIXES:
            start -= 1
        if start <= 0:
            return parts
        return parts[start:] + parts[:start]

    def _chop_corp_suffixes(self, parts):
        """Strip trailing ``COMPANY_SUFFIXES`` words from ``parts`` in place.

        The first word is always kept, even if it is itself a suffix word.
        Returns the same (now possibly shorter) list.
        """
        keep = len(parts)
        while keep > 1 and parts[keep - 1].lower() in self.COMPANY_SUFFIXES:
            keep -= 1
        del parts[keep:]
        return parts

    def _str2entity(self, s, trim_re, words_rearrange_func):
        """Build a hyphen-joined entity string from ``s``.

        Words are cleaned with ``trim_re`` via ``_entity_parts``, optionally
        passed through ``words_rearrange_func`` (skipped when it is ``None``),
        and joined with ``'-'``.
        """
        parts = list(self._entity_parts(s, trim_re))
        if words_rearrange_func is not None:
            parts = words_rearrange_func(parts)
        return '-'.join(parts)

    def _name2entity(self, name, rearrange_func1, rearrange_func2):
        """Convert a display name into an entity string.

        Bracketed asides and letter strokes are removed first.  The entity is
        then built keeping only ASCII letters, digits and hyphens, using
        ``rearrange_func1``; if that leaves nothing, it is rebuilt keeping
        any word characters, using ``rearrange_func2``.
        """
        name = self._remove_parens(name)
        name = self._destroke(name)
        entity = self._str2entity(name, self.NONASCII_RE, rearrange_func1)
        if not entity:
            entity = self._str2entity(name, self.NONALNUM_RE, rearrange_func2)
        return entity

    def run(self, data):
        """Add ``entity`` / ``corp_entity`` keys to ``data`` in place.

        Existing ``entity`` and ``corp_entity`` values are never overwritten.
        Nothing is returned.
        """
        if ('payee' in data) and ('entity' not in data):
            data['entity'] = self._name2entity(
                data['payee'], self._move_last_name, None)
        if ('corporation' in data) and ('corp_entity' not in data):
            data['corp_entity'] = self._name2entity(
                data['corporation'],
                self._chop_corp_suffixes,
                self._chop_corp_suffixes,
            )
|