extract: Lay the groundwork for specialized PDF extractors.

* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
This commit is contained in:
Brett Smith 2021-01-09 15:16:07 -05:00
parent 8b2633ec23
commit 1908358c30
2 changed files with 21 additions and 9 deletions

View file

@@ -1,4 +1,4 @@
"""extract.py - Extract form data from PDF files""" """pdfforms/extract/__init__.py - Extract form data from PDF files"""
# Copyright © 2021 Brett Smith # Copyright © 2021 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0 # License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
# #
@@ -12,9 +12,9 @@ import sys
import yaml import yaml
from . import fields as fieldmod from .. import fields as fieldmod
from . import utils as pdfutils from .. import utils as pdfutils
from .. import cliutil from ... import cliutil
from pathlib import Path from pathlib import Path
from pdfminer.pdfdocument import PDFDocument # type:ignore[import] from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
@@ -24,11 +24,13 @@ from pdfminer.pdftypes import resolve1 # type:ignore[import]
from typing import ( from typing import (
Any, Any,
BinaryIO, BinaryIO,
Iterable,
Iterator, Iterator,
Mapping, Mapping,
Optional, Optional,
Sequence, Sequence,
TextIO, TextIO,
Type,
) )
PROGNAME = 'pdfform-extract' PROGNAME = 'pdfform-extract'
@@ -100,14 +102,22 @@ class FormExtractor:
for kid in field.kids(): for kid in field.kids():
yield from self._extract_field(kid, name) yield from self._extract_field(kid, name)
def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
pass
def extract(self) -> Mapping[str, Any]: def extract(self) -> Mapping[str, Any]:
fields = [
fieldmod.FormField.by_type(resolve1(field_source))
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
]
self._transform_fields(iter(fields))
return { return {
'from file': self.source, 'from file': self.source,
'form key': self.form_key, 'form key': self.form_key,
'fields': [ 'fields': [
field yaml_field
for field_source in resolve1(self.document.catalog[self.form_key])['Fields'] for field in fields
for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source))) for yaml_field in self._extract_field(field)
], ],
} }
@@ -155,15 +165,16 @@ Use `-` to read from stdin.
def main(arglist: Optional[Sequence[str]]=None, def main(arglist: Optional[Sequence[str]]=None,
stdout: TextIO=sys.stdout, stdout: TextIO=sys.stdout,
stderr: TextIO=sys.stderr, stderr: TextIO=sys.stderr,
extract_cls: Type[FormExtractor]=FormExtractor,
) -> int: ) -> int:
args = parse_arguments(arglist) args = parse_arguments(arglist)
cliutil.set_loglevel(logger, args.loglevel) cliutil.set_loglevel(logger, args.loglevel)
with contextlib.ExitStack() as exit_stack: with contextlib.ExitStack() as exit_stack:
if args.document == cliutil.STDSTREAM_PATH: if args.document == cliutil.STDSTREAM_PATH:
extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key) extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
else: else:
extractor = exit_stack.enter_context( extractor = exit_stack.enter_context(
FormExtractor.from_path(args.document, args.form_key), extract_cls.from_path(args.document, args.form_key),
) )
extracted_form = extractor.extract() extracted_form = extractor.extract()
with contextlib.ExitStack() as exit_stack: with contextlib.ExitStack() as exit_stack:

View file

@@ -33,6 +33,7 @@ setup(
packages=[ packages=[
'conservancy_beancount', 'conservancy_beancount',
'conservancy_beancount.pdfforms', 'conservancy_beancount.pdfforms',
'conservancy_beancount.pdfforms.extract',
'conservancy_beancount.plugin', 'conservancy_beancount.plugin',
'conservancy_beancount.reports', 'conservancy_beancount.reports',
'conservancy_beancount.tools', 'conservancy_beancount.tools',