extract: Lay the groundwork for specialized PDF extractors.

* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
This commit is contained in:
Brett Smith 2021-01-09 15:16:07 -05:00
parent 8b2633ec23
commit 1908358c30
2 changed files with 21 additions and 9 deletions

View file

@@ -1,4 +1,4 @@
"""extract.py - Extract form data from PDF files"""
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
# Copyright © 2021 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
@@ -12,9 +12,9 @@ import sys
import yaml
from . import fields as fieldmod
from . import utils as pdfutils
from .. import cliutil
from .. import fields as fieldmod
from .. import utils as pdfutils
from ... import cliutil
from pathlib import Path
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
@@ -24,11 +24,13 @@ from pdfminer.pdftypes import resolve1 # type:ignore[import]
from typing import (
Any,
BinaryIO,
Iterable,
Iterator,
Mapping,
Optional,
Sequence,
TextIO,
Type,
)
PROGNAME = 'pdfform-extract'
@@ -100,14 +102,22 @@ class FormExtractor:
for kid in field.kids():
yield from self._extract_field(kid, name)
# Hook for specialized extractors; this base implementation does nothing.
# extract() calls it with an iterator over the FormField objects built from
# the form's 'Fields' entry, before _extract_field() is run over them to
# produce the output.
# The return value is None and extract() keeps using its own list afterwards,
# so an override presumably adjusts the FormField objects in place --
# NOTE(review): confirm against the FormField API.
def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
pass
def extract(self) -> Mapping[str, Any]:
fields = [
fieldmod.FormField.by_type(resolve1(field_source))
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
]
self._transform_fields(iter(fields))
return {
'from file': self.source,
'form key': self.form_key,
'fields': [
field
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
yaml_field
for field in fields
for yaml_field in self._extract_field(field)
],
}
@@ -155,15 +165,16 @@ Use `-` to read from stdin.
def main(arglist: Optional[Sequence[str]]=None,
stdout: TextIO=sys.stdout,
stderr: TextIO=sys.stderr,
extract_cls: Type[FormExtractor]=FormExtractor,
) -> int:
args = parse_arguments(arglist)
cliutil.set_loglevel(logger, args.loglevel)
with contextlib.ExitStack() as exit_stack:
if args.document == cliutil.STDSTREAM_PATH:
extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
else:
extractor = exit_stack.enter_context(
FormExtractor.from_path(args.document, args.form_key),
extract_cls.from_path(args.document, args.form_key),
)
extracted_form = extractor.extract()
with contextlib.ExitStack() as exit_stack:

View file

@@ -33,6 +33,7 @@ setup(
packages=[
'conservancy_beancount',
'conservancy_beancount.pdfforms',
'conservancy_beancount.pdfforms.extract',
'conservancy_beancount.plugin',
'conservancy_beancount.reports',
'conservancy_beancount.tools',