extract: Lay the groundwork for specialized PDF extractors.
* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
This commit is contained in:
parent
8b2633ec23
commit
1908358c30
2 changed files with 21 additions and 9 deletions
|
@@ -1,4 +1,4 @@
|
|||
"""extract.py - Extract form data from PDF files"""
|
||||
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
|
||||
# Copyright © 2021 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
|
@@ -12,9 +12,9 @@ import sys
|
|||
|
||||
import yaml
|
||||
|
||||
from . import fields as fieldmod
|
||||
from . import utils as pdfutils
|
||||
from .. import cliutil
|
||||
from .. import fields as fieldmod
|
||||
from .. import utils as pdfutils
|
||||
from ... import cliutil
|
||||
|
||||
from pathlib import Path
|
||||
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
|
||||
|
@@ -24,11 +24,13 @@ from pdfminer.pdftypes import resolve1 # type:ignore[import]
|
|||
from typing import (
|
||||
Any,
|
||||
BinaryIO,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
TextIO,
|
||||
Type,
|
||||
)
|
||||
|
||||
PROGNAME = 'pdfform-extract'
|
||||
|
@@ -100,14 +102,22 @@ class FormExtractor:
|
|||
for kid in field.kids():
|
||||
yield from self._extract_field(kid, name)
|
||||
|
||||
def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
|
||||
pass
|
||||
|
||||
def extract(self) -> Mapping[str, Any]:
|
||||
fields = [
|
||||
fieldmod.FormField.by_type(resolve1(field_source))
|
||||
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
|
||||
]
|
||||
self._transform_fields(iter(fields))
|
||||
return {
|
||||
'from file': self.source,
|
||||
'form key': self.form_key,
|
||||
'fields': [
|
||||
field
|
||||
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
|
||||
for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
|
||||
yaml_field
|
||||
for field in fields
|
||||
for yaml_field in self._extract_field(field)
|
||||
],
|
||||
}
|
||||
|
||||
|
@@ -155,15 +165,16 @@ Use `-` to read from stdin.
|
|||
def main(arglist: Optional[Sequence[str]]=None,
|
||||
stdout: TextIO=sys.stdout,
|
||||
stderr: TextIO=sys.stderr,
|
||||
extract_cls: Type[FormExtractor]=FormExtractor,
|
||||
) -> int:
|
||||
args = parse_arguments(arglist)
|
||||
cliutil.set_loglevel(logger, args.loglevel)
|
||||
with contextlib.ExitStack() as exit_stack:
|
||||
if args.document == cliutil.STDSTREAM_PATH:
|
||||
extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
|
||||
extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
|
||||
else:
|
||||
extractor = exit_stack.enter_context(
|
||||
FormExtractor.from_path(args.document, args.form_key),
|
||||
extract_cls.from_path(args.document, args.form_key),
|
||||
)
|
||||
extracted_form = extractor.extract()
|
||||
with contextlib.ExitStack() as exit_stack:
|
1
setup.py
1
setup.py
|
@@ -33,6 +33,7 @@ setup(
|
|||
packages=[
|
||||
'conservancy_beancount',
|
||||
'conservancy_beancount.pdfforms',
|
||||
'conservancy_beancount.pdfforms.extract',
|
||||
'conservancy_beancount.plugin',
|
||||
'conservancy_beancount.reports',
|
||||
'conservancy_beancount.tools',
|
||||
|
|
Loading…
Reference in a new issue