extract: Lay the groundwork for specialized PDF extractors.
* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
This commit is contained in:
parent
8b2633ec23
commit
1908358c30
2 changed files with 21 additions and 9 deletions
|
@ -1,4 +1,4 @@
|
||||||
"""extract.py - Extract form data from PDF files"""
|
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
|
||||||
# Copyright © 2021 Brett Smith
|
# Copyright © 2021 Brett Smith
|
||||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||||
#
|
#
|
||||||
|
@ -12,9 +12,9 @@ import sys
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from . import fields as fieldmod
|
from .. import fields as fieldmod
|
||||||
from . import utils as pdfutils
|
from .. import utils as pdfutils
|
||||||
from .. import cliutil
|
from ... import cliutil
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
|
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
|
||||||
|
@ -24,11 +24,13 @@ from pdfminer.pdftypes import resolve1 # type:ignore[import]
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
Any,
|
||||||
BinaryIO,
|
BinaryIO,
|
||||||
|
Iterable,
|
||||||
Iterator,
|
Iterator,
|
||||||
Mapping,
|
Mapping,
|
||||||
Optional,
|
Optional,
|
||||||
Sequence,
|
Sequence,
|
||||||
TextIO,
|
TextIO,
|
||||||
|
Type,
|
||||||
)
|
)
|
||||||
|
|
||||||
PROGNAME = 'pdfform-extract'
|
PROGNAME = 'pdfform-extract'
|
||||||
|
@ -100,14 +102,22 @@ class FormExtractor:
|
||||||
for kid in field.kids():
|
for kid in field.kids():
|
||||||
yield from self._extract_field(kid, name)
|
yield from self._extract_field(kid, name)
|
||||||
|
|
||||||
|
def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
def extract(self) -> Mapping[str, Any]:
|
def extract(self) -> Mapping[str, Any]:
|
||||||
|
fields = [
|
||||||
|
fieldmod.FormField.by_type(resolve1(field_source))
|
||||||
|
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
|
||||||
|
]
|
||||||
|
self._transform_fields(iter(fields))
|
||||||
return {
|
return {
|
||||||
'from file': self.source,
|
'from file': self.source,
|
||||||
'form key': self.form_key,
|
'form key': self.form_key,
|
||||||
'fields': [
|
'fields': [
|
||||||
field
|
yaml_field
|
||||||
for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
|
for field in fields
|
||||||
for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
|
for yaml_field in self._extract_field(field)
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,15 +165,16 @@ Use `-` to read from stdin.
|
||||||
def main(arglist: Optional[Sequence[str]]=None,
|
def main(arglist: Optional[Sequence[str]]=None,
|
||||||
stdout: TextIO=sys.stdout,
|
stdout: TextIO=sys.stdout,
|
||||||
stderr: TextIO=sys.stderr,
|
stderr: TextIO=sys.stderr,
|
||||||
|
extract_cls: Type[FormExtractor]=FormExtractor,
|
||||||
) -> int:
|
) -> int:
|
||||||
args = parse_arguments(arglist)
|
args = parse_arguments(arglist)
|
||||||
cliutil.set_loglevel(logger, args.loglevel)
|
cliutil.set_loglevel(logger, args.loglevel)
|
||||||
with contextlib.ExitStack() as exit_stack:
|
with contextlib.ExitStack() as exit_stack:
|
||||||
if args.document == cliutil.STDSTREAM_PATH:
|
if args.document == cliutil.STDSTREAM_PATH:
|
||||||
extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
|
extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
|
||||||
else:
|
else:
|
||||||
extractor = exit_stack.enter_context(
|
extractor = exit_stack.enter_context(
|
||||||
FormExtractor.from_path(args.document, args.form_key),
|
extract_cls.from_path(args.document, args.form_key),
|
||||||
)
|
)
|
||||||
extracted_form = extractor.extract()
|
extracted_form = extractor.extract()
|
||||||
with contextlib.ExitStack() as exit_stack:
|
with contextlib.ExitStack() as exit_stack:
|
1
setup.py
1
setup.py
|
@ -33,6 +33,7 @@ setup(
|
||||||
packages=[
|
packages=[
|
||||||
'conservancy_beancount',
|
'conservancy_beancount',
|
||||||
'conservancy_beancount.pdfforms',
|
'conservancy_beancount.pdfforms',
|
||||||
|
'conservancy_beancount.pdfforms.extract',
|
||||||
'conservancy_beancount.plugin',
|
'conservancy_beancount.plugin',
|
||||||
'conservancy_beancount.reports',
|
'conservancy_beancount.reports',
|
||||||
'conservancy_beancount.tools',
|
'conservancy_beancount.tools',
|
||||||
|
|
Loading…
Reference in a new issue