irs990scheduleA: New PDF extractor.

This commit is contained in:
Brett Smith 2021-01-09 15:17:55 -05:00
parent 1908358c30
commit 1c95c1b1b1
2 changed files with 85 additions and 1 deletions

View file

@ -0,0 +1,83 @@
"""irs990scheduleA.py - Extract IRS 990 Schedule A form data from the prior FY"""
# Copyright © 2021 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
import collections
import functools
import itertools
import logging
from . import FormExtractor, main
from .. import fields as fieldmod
from ... import cliutil
from typing import (
Iterable,
Iterator,
Optional,
Tuple,
)
# Command name for this tool; passed to cliutil.make_entry_point() when the
# module's entry_point is built.
PROGNAME = 'pdfform-extract-irs990scheduleA'
# Module logger. The dotted name is written out literally instead of using
# __name__ (presumably so it is unchanged when the file runs as __main__).
logger = logging.getLogger('conservancy_beancount.pdfforms.extract.irs990scheduleA')
def _make_shifts(
key_fmt: str,
start_count: int,
shift_count: int=4,
clear_count: int=2,
) -> Iterator[Tuple[str, Optional[str]]]:
for index in range(start_count, start_count + shift_count):
yield (key_fmt.format(index), key_fmt.format(index + 1))
index += 1
for index in range(index, index + clear_count):
yield (key_fmt.format(index), None)
class IRS990ScheduleAExtractor(FormExtractor):
    """Extractor that re-maps the values of Schedule A page 2 form fields.

    ``_FIELD_SOURCES`` maps each affected field key to the key whose value it
    should receive, or to ``None`` when the field should be emptied.
    ``_transform_fields`` applies that table to the extracted fields.
    NOTE(review): presumably ``_transform_fields`` is a hook called by
    ``FormExtractor`` -- confirm against the base class.
    """

    # Fields that always end up with a None value. Some of them (f2_53, c2_2,
    # c2_4) are also used as sources for the Section C entries below.
    _BLANK_FIELDS = [
        'topmostSubform[0].Page2[0].Table_SectionA[0].Line5[0].f2_25[0]',
        'topmostSubform[0].Page2[0].Table_SectionA[0].Line6[0].f2_26[0]',
        'topmostSubform[0].Page2[0].Table_SectionB[0].Line11[0].f2_51[0]',
        'topmostSubform[0].Page2[0].f2_52[0]',
        'topmostSubform[0].Page2[0].f2_53[0]',
        'topmostSubform[0].Page2[0].c2_2[0]',
        'topmostSubform[0].Page2[0].c2_4[0]',
    ]

    # Target key -> source key (or None). Each table row below contributes
    # six consecutive field numbers: four shifted, then two cleared.
    _FIELD_SOURCES = dict(itertools.chain(
        itertools.chain.from_iterable(itertools.starmap(_make_shifts, [
            ('topmostSubform[0].Page2[0].Table_SectionA[0].Line1[0].f2_{}[0]', 1),
            ('topmostSubform[0].Page2[0].Table_SectionA[0].Line2[0].f2_{}[0]', 7),
            ('topmostSubform[0].Page2[0].Table_SectionA[0].Line3[0].f2_{}[0]', 13),
            ('topmostSubform[0].Page2[0].Table_SectionA[0].Line4[0].f2_{}[0]', 19),
            ('topmostSubform[0].Page2[0].Table_SectionB[0].Line7[0].f2_{}[0]', 27),
            ('topmostSubform[0].Page2[0].Table_SectionB[0].Line8[0].f2_{}[0]', 33),
            ('topmostSubform[0].Page2[0].Table_SectionB[0].Line9[0].f2_{}[0]', 39),
            ('topmostSubform[0].Page2[0].Table_SectionB[0].Line10[0].f2_{}[0]', 45),
        ])),
        zip(_BLANK_FIELDS, itertools.repeat(None)),
    ))
    # Part II Section C
    _FIELD_SOURCES.update({
        'topmostSubform[0].Page2[0].f2_54[0]': 'topmostSubform[0].Page2[0].f2_53[0]',
        'topmostSubform[0].Page2[0].c2_3[0]': 'topmostSubform[0].Page2[0].c2_2[0]',
        'topmostSubform[0].Page2[0].c2_5[0]': 'topmostSubform[0].Page2[0].c2_4[0]',
    })

    def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
        """Apply ``_FIELD_SOURCES`` to ``fields``, changing values in place.

        Every new value is read before any field is written, so a field that
        is both a source and a target hands over its original value.

        Raises:
            KeyError: if a key named in ``_FIELD_SOURCES`` is not among the
                keys produced by the fields' ``as_mapping()``.
        """
        by_key = {}
        for field in fields:
            for key, mapped in field.as_mapping():
                by_key[key] = mapped

        # Phase 1: collect every replacement value from the untouched fields.
        pending = {}
        for target_key, source_key in self._FIELD_SOURCES.items():
            if source_key is None:
                pending[target_key] = None
            else:
                pending[target_key] = by_key[source_key].value()

        # Phase 2: write the collected values.
        for target_key, new_value in pending.items():
            by_key[target_key].set_value(new_value)
# Rebind `main` (imported from the parent package) with this module's
# extractor class pre-selected.
main = functools.partial(main, extract_cls=IRS990ScheduleAExtractor)
entry_point = cliutil.make_entry_point(__name__, PROGNAME)

if __name__ == '__main__':
    # The `exit` helper is installed by the `site` module for interactive use
    # and is unavailable under `python -S`; SystemExit is always a builtin.
    raise SystemExit(entry_point())

View file

@ -5,7 +5,7 @@ from setuptools import setup
setup(
name='conservancy_beancount',
description="Plugin, library, and reports for reading Conservancy's books",
version='1.15.2',
version='1.15.3',
author='Software Freedom Conservancy',
author_email='info@sfconservancy.org',
license='GNU AGPLv3+',
@ -50,6 +50,7 @@ setup(
'ledger-report = conservancy_beancount.reports.ledger:entry_point',
'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point',
'pdfform-extract-irs990scheduleA = conservancy_beancount.pdfforms.extract.irs990scheduleA:entry_point',
'pdfform-fill = conservancy_beancount.pdfforms.fill:entry_point',
'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point',
],