hooks.add_entity: Remove common company suffixes from entity tags.
This commit is contained in:
parent
fb947e6bb5
commit
e8bcbd5f99
2 changed files with 51 additions and 16 deletions
|
@ -9,6 +9,15 @@ class AddEntityHook:
|
||||||
'la',
|
'la',
|
||||||
'van',
|
'van',
|
||||||
])
|
])
|
||||||
|
# Trailing words that only describe a company's legal form. They are
# stripped from the end of corporation names when building entity tags.
# Entries are lowercase because each candidate word is lowercased before
# it is looked up in this set.
COMPANY_SUFFIXES = frozenset([
    'co',
    'company',
    'corp',
    'corporation',
    'inc',
    'incorporated',
    'llc',
])
|
||||||
NONASCII_RE = re.compile(r'[^-A-Za-z0-9]')
|
NONASCII_RE = re.compile(r'[^-A-Za-z0-9]')
|
||||||
NONALNUM_RE = re.compile(r'[^-\w]')
|
NONALNUM_RE = re.compile(r'[^-\w]')
|
||||||
OPEN_PARENS = ['\\(', '\\[', '\\{']
|
OPEN_PARENS = ['\\(', '\\[', '\\{']
|
||||||
|
@ -43,30 +52,46 @@ class AddEntityHook:
|
||||||
if word:
|
if word:
|
||||||
yield word
|
yield word
|
||||||
|
|
||||||
def _str2entity(self, s, trim_re, name_shifts):
|
def _move_last_name(self, parts):
|
||||||
|
pivot = -2
|
||||||
|
try:
|
||||||
|
while parts[pivot].lower() in self.NAME_PREFIXES:
|
||||||
|
pivot -= 1
|
||||||
|
except IndexError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
pivot += 1
|
||||||
|
parts = parts[pivot:] + parts[:pivot]
|
||||||
|
return parts
|
||||||
|
|
||||||
|
def _chop_corp_suffixes(self, parts):
|
||||||
|
for index in range(-1, -len(parts), -1):
|
||||||
|
if parts[index].lower() not in self.COMPANY_SUFFIXES:
|
||||||
|
del_from = index + 1
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
del_from = 1
|
||||||
|
if del_from != 0:
|
||||||
|
del parts[del_from:]
|
||||||
|
return parts
|
||||||
|
|
||||||
|
def _str2entity(self, s, trim_re, words_rearrange_func):
|
||||||
parts = list(self._entity_parts(s, trim_re))
|
parts = list(self._entity_parts(s, trim_re))
|
||||||
if name_shifts > 0:
|
if words_rearrange_func is not None:
|
||||||
pivot = -name_shifts - 1
|
parts = words_rearrange_func(parts)
|
||||||
try:
|
|
||||||
while parts[pivot].lower() in self.NAME_PREFIXES:
|
|
||||||
pivot -= 1
|
|
||||||
except IndexError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
pivot += 1
|
|
||||||
parts = parts[pivot:] + parts[:pivot]
|
|
||||||
return '-'.join(parts)
|
return '-'.join(parts)
|
||||||
|
|
||||||
def _name2entity(self, name, name_shifts):
|
def _name2entity(self, name, rearrange_func1, rearrange_func2):
|
||||||
name = self._remove_parens(name)
|
name = self._remove_parens(name)
|
||||||
name = self._destroke(name)
|
name = self._destroke(name)
|
||||||
entity = self._str2entity(name, self.NONASCII_RE, name_shifts)
|
entity = self._str2entity(name, self.NONASCII_RE, rearrange_func1)
|
||||||
if not entity:
|
if not entity:
|
||||||
entity = self._str2entity(name, self.NONALNUM_RE, 0)
|
entity = self._str2entity(name, self.NONALNUM_RE, rearrange_func2)
|
||||||
return entity
|
return entity
|
||||||
|
|
||||||
def run(self, data):
|
def run(self, data):
|
||||||
if ('payee' in data) and ('entity' not in data):
|
if ('payee' in data) and ('entity' not in data):
|
||||||
data['entity'] = self._name2entity(data['payee'], 1)
|
data['entity'] = self._name2entity(data['payee'], self._move_last_name, None)
|
||||||
if ('corporation' in data) and ('corp_entity' not in data):
|
if ('corporation' in data) and ('corp_entity' not in data):
|
||||||
data['corp_entity'] = self._name2entity(data['corporation'], 0)
|
data['corp_entity'] = self._name2entity(
|
||||||
|
data['corporation'], self._chop_corp_suffixes, self._chop_corp_suffixes)
|
||||||
|
|
|
@ -27,6 +27,16 @@ def test_load_all():
|
||||||
('payee', 'A de B de la C', 'entity', 'de-la-C-A-de-B'),
|
('payee', 'A de B de la C', 'entity', 'de-la-C-A-de-B'),
|
||||||
('corporation', 'Company A', 'corp_entity', 'Company-A'),
|
('corporation', 'Company A', 'corp_entity', 'Company-A'),
|
||||||
('corporation', 'Company A 99', 'corp_entity', 'Company-A-99'),
|
('corporation', 'Company A 99', 'corp_entity', 'Company-A-99'),
|
||||||
|
('corporation', 'DX Co.', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX Company', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX Company Inc.', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX Corp', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX Corp LLC', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX Corporation', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX, Inc.', 'corp_entity', 'DX'),
|
||||||
|
('corporation', 'DX Incorporated', 'corp_entity', 'DX'),
|
||||||
|
('payee', 'Poe Inc', 'entity', 'Inc-Poe'),
|
||||||
|
('corporation', 'Silly Van', 'corp_entity', 'Silly-Van'),
|
||||||
])
|
])
|
||||||
def test_add_entity(in_key, payee, out_key, expected):
|
def test_add_entity(in_key, payee, out_key, expected):
|
||||||
data = {in_key: payee}
|
data = {in_key: payee}
|
||||||
|
|
Loading…
Reference in a new issue