pdfforms: Initial module and tool to extract PDF form data to YAML.
Next steps:
* A tool to fill the PDF form based on values written to that YAML.
* An extension to fill some of those values with numbers queried from the books (which is why we need something more involved than FDF).
This commit is contained in:
parent
1b7fdf4f3b
commit
13c66e8ce2
10 changed files with 1007 additions and 0 deletions
0
conservancy_beancount/pdfforms/__init__.py
Normal file
0
conservancy_beancount/pdfforms/__init__.py
Normal file
18
conservancy_beancount/pdfforms/errors.py
Normal file
18
conservancy_beancount/pdfforms/errors.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
"""errors.py - Exception classes for PDF reporting errors"""
|
||||
# Copyright © 2021 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
# Full copyright and licensing details can be found at toplevel file
|
||||
# LICENSE.txt in the repository.
|
||||
|
||||
class PDFError(Exception):
    """Common base class for the exceptions defined in this module."""


class PDFKeyError(KeyError, PDFError):
    """A requested key is not available.

    Also a KeyError, so ``except KeyError`` handlers catch it.
    """


class PDFSpecError(ValueError, PDFError):
    """PDF data is missing or invalid in a way the code cannot interpret.

    Also a ValueError, so ``except ValueError`` handlers catch it.
    """


class NoFormDataError(ValueError, PDFError):
    """No form data could be found.

    Also a ValueError, so ``except ValueError`` handlers catch it.
    """
|
179
conservancy_beancount/pdfforms/extract.py
Normal file
179
conservancy_beancount/pdfforms/extract.py
Normal file
|
@ -0,0 +1,179 @@
|
|||
"""extract.py - Extract form data from PDF files"""
|
||||
# Copyright © 2021 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
# Full copyright and licensing details can be found at toplevel file
|
||||
# LICENSE.txt in the repository.
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
|
||||
from . import fields as fieldmod
|
||||
from . import utils as pdfutils
|
||||
from .. import cliutil
|
||||
|
||||
from pathlib import Path
|
||||
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
|
||||
from pdfminer.pdfparser import PDFParser # type:ignore[import]
|
||||
from pdfminer.pdftypes import resolve1 # type:ignore[import]
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
BinaryIO,
|
||||
Iterator,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
TextIO,
|
||||
)
|
||||
|
||||
# Program name: handed to argparse as `prog` and to cliutil.make_entry_point().
PROGNAME = 'pdfform-extract'
|
||||
# Module logger; main() sets its level from the parsed command line arguments.
logger = logging.getLogger('conservancy_beancount.pdfforms.extract')
|
||||
|
||||
class FormExtractor:
    """Extract fillable form fields from a parsed PDF or FDF document.

    Attributes:
      document: the pdfminer PDFDocument being read.
      form_key: the document catalog key that holds the form data.
      source: a label for where the document came from (from_path() and
        from_file() store a file name here), or None.
    """

    def __init__(
            self,
            pdf: PDFDocument,
            form_key: Optional[str]=None,
            source: Optional[str]=None,
    ) -> None:
        """Wrap an already-parsed document.

        When form_key is None, it is guessed with pdfutils.guess_form_key().
        """
        if form_key is None:
            form_key = pdfutils.guess_form_key(pdf)
        self.document = pdf
        self.form_key = form_key
        self.source = source

    @classmethod
    @contextlib.contextmanager
    def from_path(
            cls,
            path: Path,
            form_key: Optional[str]=None,
    ) -> Iterator['FormExtractor']:
        """Context manager that yields an extractor for the file at path.

        The file is opened in binary mode, stays open for the body of the
        ``with`` block, and is closed when the block exits.
        """
        with path.open('rb') as pdf_file:
            yield cls.from_file(pdf_file, form_key, path)

    @classmethod
    def from_file(
            cls,
            source: BinaryIO,
            form_key: Optional[str]=None,
            source_path: Optional[Path]=None,
    ) -> 'FormExtractor':
        """Build an extractor from an open binary file object.

        The extractor's source label is the final component of source_path.
        When source_path is None, the label is derived from the stream's own
        ``name``; streams without a string name get no label.
        The caller remains responsible for closing the file.
        """
        if source_path is not None:
            source_name: Optional[str] = source_path.name
        else:
            # Not every binary stream has a path-like name: io.BytesIO has no
            # name at all, and files opened from a descriptor report an int.
            stream_name = getattr(source, 'name', None)
            if isinstance(stream_name, str):
                source_name = Path(stream_name).name
            else:
                source_name = None
        parser = PDFParser(source)
        pdf_doc = PDFDocument(parser)
        return cls(pdf_doc, form_key, source_name)

    def _extract_field(
            self,
            field: fieldmod.FormField,
            name_prefix: str='',
    ) -> Iterator[Mapping[str, Any]]:
        """Yield an output mapping for field and each of its descendants.

        A field is reported only when it has a recognized field type and is
        not read-only. Its kids are always visited, with this field's full
        name plus ``.`` as their name prefix.
        """
        name = name_prefix + field.name()
        try:
            field_type: Optional[str] = field.field_type().name
        except ValueError:
            # No usable field type (e.g. a pure container); skip the field
            # itself but still descend into its kids below.
            field_type = None
        if field_type is not None and not field.is_readonly():
            retval = {
                'fdf': {
                    'type': field_type,
                    'name': name,
                },
                'description': f'{field_type} {name}',
                'value': field.fill_value(),
            }
            if isinstance(field, fieldmod.CheckboxField):
                retval['fdf']['options'] = field.options()
            yield retval
        kid_prefix = name + '.'
        for kid in field.kids():
            yield from self._extract_field(kid, kid_prefix)

    def extract(self) -> Mapping[str, Any]:
        """Return the document's form data as a plain mapping.

        The result has the keys ``from file``, ``form key``, and ``fields``,
        where ``fields`` is the flat list produced by walking every top-level
        field with _extract_field().
        """
        form = resolve1(self.document.catalog[self.form_key])
        fields = []
        # The Fields array may itself be stored as an indirect object, so
        # resolve it before iterating, just like each of its elements.
        for field_source in resolve1(form['Fields']):
            root_field = fieldmod.FormField.by_type(resolve1(field_source))
            fields.extend(self._extract_field(root_field))
        return {
            'from file': self.source,
            'form key': self.form_key,
            'fields': fields,
        }
|
||||
|
||||
|
||||
class FormYAMLDumper(yaml.dumper.SafeDumper):
    """SafeDumper that writes mappings unflowed and without re-sorting them."""

    def represent_mapping(self, tag: Any, value: Any, flow_style: Any=None) -> Any:
        """Represent value as a mapping node.

        flow_style defaults to False here instead of being left as None.
        """
        # We never want mappings flowed by default.
        style = False if flow_style is None else flow_style
        # The parent method re-sorts anything it can call .items() on.
        # Passing it the items themselves sidesteps that and keeps the
        # mapping's own order in the output.
        try:
            pairs = value.items()
        except AttributeError:
            pairs = value
        return super().represent_mapping(tag, pairs, style)
|
||||
|
||||
|
||||
def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
    """Parse the command line for this tool and return the resulting namespace.

    Besides the version and loglevel arguments added through cliutil, this
    defines --form-key/-f, --output-file/-O, and the positional document.
    """
    parser = argparse.ArgumentParser(prog=PROGNAME)
    cliutil.add_version_argument(parser)
    cliutil.add_loglevel_argument(parser)
    add_argument = parser.add_argument
    add_argument(
        '--form-key', '-f',
        metavar='KEY',
        help="""Key in the document catalog with form data.
Default is guessed by examining the document.
""")
    add_argument(
        '--output-file', '-O',
        metavar='PATH',
        type=Path,
        help="""Write output YAML to this file, or stdout when PATH is `-`.
Default stdout.
""")
    add_argument(
        'document',
        type=Path,
        help="""PDF or FDF file to extract form data from.
Use `-` to read from stdin.
""")
    return parser.parse_args(arglist)
|
||||
|
||||
def main(arglist: Optional[Sequence[str]]=None,
         stdout: TextIO=sys.stdout,
         stderr: TextIO=sys.stderr,
) -> int:
    """Extract form data from the requested document and write it as YAML.

    Reads from standard input when the document argument equals
    cliutil.STDSTREAM_PATH, otherwise from the named file. Returns 0.
    """
    args = parse_arguments(arglist)
    cliutil.set_loglevel(logger, args.loglevel)
    read_stdin = args.document == cliutil.STDSTREAM_PATH
    # Extraction happens inside this block so that a file opened by
    # from_path() is still open while its fields are read.
    with contextlib.ExitStack() as input_stack:
        if read_stdin:
            extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
        else:
            extractor = input_stack.enter_context(
                FormExtractor.from_path(args.document, args.form_key),
            )
        extracted_form = extractor.extract()
    with contextlib.ExitStack() as output_stack:
        out_file = cliutil.text_output(args.output_file, stdout)
        # Only take ownership of (and later close) a file we did not receive
        # from the caller.
        if out_file is not stdout:
            output_stack.enter_context(out_file)
        yaml.dump(extracted_form, out_file, Dumper=FormYAMLDumper)
    return 0
|
||||
|
||||
# Callable used both as the installed console script and when this module is
# run directly.
entry_point = cliutil.make_entry_point(__name__, PROGNAME)

if __name__ == '__main__':
    # Use sys.exit() rather than the bare exit() helper: exit() is installed
    # by the site module and is not available when Python runs without it.
    sys.exit(entry_point())
|
245
conservancy_beancount/pdfforms/fields.py
Normal file
245
conservancy_beancount/pdfforms/fields.py
Normal file
|
@ -0,0 +1,245 @@
|
|||
"""fields.py - Python classes to read and write PDF form data"""
|
||||
# Copyright © 2020 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
# Full copyright and licensing details can be found at toplevel file
|
||||
# LICENSE.txt in the repository.
|
||||
|
||||
import enum
|
||||
import functools
|
||||
|
||||
from pdfminer.pdftypes import resolve1 # type:ignore[import]
|
||||
from pdfminer import psparser # type:ignore[import]
|
||||
from . import utils as pdfutils
|
||||
from .errors import PDFKeyError, PDFSpecError
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Iterator,
|
||||
Optional,
|
||||
Mapping,
|
||||
MutableMapping,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
# Type of the raw field dictionary that FormField wraps: string keys mapped
# to arbitrary values.
FieldSource = MutableMapping[str, Any]
|
||||
|
||||
class FieldFlags(enum.IntFlag):
    """Bit masks for the integer returned by FormField.flags()."""

    # Flags for all fields
    ReadOnly = 1 << 0
    Required = 1 << 1
    NoExport = 1 << 2
    # Flags for buttons
    NoToggleToOff = 1 << 14
    Radio = 1 << 15
    Pushbutton = 1 << 16
    RadiosInUnison = 1 << 25
    # Flags for text
    Multiline = 1 << 12
    Password = 1 << 13
    FileSelect = 1 << 20
    DoNotSpellCheck = 1 << 22
    DoNotScroll = 1 << 23
    Comb = 1 << 24
    # Same bit as RadiosInUnison, so the enum treats this name as an alias.
    RichText = 1 << 25
|
||||
|
||||
|
||||
class FieldType(enum.Enum):
    """Known form field types, plus longer aliases for each.

    The short member names are the ones FormField.field_type() looks up by
    name. Each upper-case name repeats a short member's value, which makes it
    an alias for that member rather than a separate one.
    """

    Btn = 'Btn'
    BUTTON = 'Btn'
    Ch = 'Ch'
    CHOICE = 'Ch'
    Sig = 'Sig'
    SIG = 'Sig'
    SIGNATURE = 'Sig'
    Tx = 'Tx'
    TEXT = 'Tx'
|
||||
|
||||
|
||||
class FormField:
    """Wrapper around one form field dictionary.

    This base class exposes values as they are stored in the dictionary.
    by_type() returns subclasses that convert values to and from Python
    types for the field types they support.
    """
    __slots__ = ['_source']
    # Marks "no default given" in _get_value(), so None can be a real default.
    _SENTINEL = object()
    # What fill_value() reports when the field has no 'V' entry of its own.
    DEFAULT_FILL: object = None
    # Keys that _get_value() also looks for in 'Parent' dictionaries.
    INHERITABLE = frozenset([
        'DV',
        'Ff',
        'FT',
        'MaxLen',
        'Opt',
        'V',
    ])

    def __init__(self, source: FieldSource) -> None:
        """Wrap source without copying it; set_value() writes back into it."""
        self._source = source

    @classmethod
    def by_type(cls, source: FieldSource) -> 'FormField':
        """Return the most specific field object available for source.

        Text fields become TextField. Buttons that are neither radio buttons
        nor pushbuttons become CheckboxField. Everything else, including
        fields without a usable field type, is a plain FormField.
        """
        # Always start from the base class rather than cls. kids() and
        # parent() call this method through an instance, and an unsupported
        # relative of, say, a TextField must not come back as a TextField.
        retval = FormField(source)
        try:
            field_type = retval.field_type()
        except ValueError:
            return retval
        flags = retval.flags()
        if field_type is FieldType.TEXT:
            retval.__class__ = TextField
        elif field_type is FieldType.BUTTON:
            # Radio buttons and pushbuttons have no dedicated class.
            if not flags & (FieldFlags.Radio | FieldFlags.Pushbutton):
                retval.__class__ = CheckboxField
        return retval

    def _get_value(self, key: str, default: Any=_SENTINEL) -> Any:
        """Return the resolved value stored under key.

        Keys listed in INHERITABLE are also searched for up the chain of
        'Parent' dictionaries. When the key is not found, return default if
        one was given, otherwise raise PDFKeyError.
        """
        can_inherit = key in self.INHERITABLE
        source: Optional[FieldSource] = self._source
        while source is not None:
            try:
                return resolve1(source[key])
            except KeyError:
                source = resolve1(source.get('Parent')) if can_inherit else None
        if default is self._SENTINEL:
            raise PDFKeyError(key)
        return default

    def field_type(self) -> FieldType:
        """Return the field's FieldType.

        Raises PDFSpecError when the field type is missing or unrecognized.
        """
        try:
            source = self._get_value('FT')
        except KeyError:
            raise PDFSpecError("field does not specify a field type") from None
        try:
            return FieldType[source.name]
        except (AttributeError, KeyError):
            raise PDFSpecError(f"field has invalid field type {source!r}") from None

    def kids(self) -> Iterator['FormField']:
        """Yield a typed field object for each entry under 'Kids'."""
        for source in self._get_value('Kids', ()):
            yield self.by_type(resolve1(source))

    def parent(self) -> Optional['FormField']:
        """Return a typed field object for 'Parent', or None without one."""
        try:
            return self.by_type(self._get_value('Parent'))
        except KeyError:
            return None

    def is_terminal(self) -> bool:
        """Return True when the field has no kids."""
        return not self._get_value('Kids', None)

    def flags(self) -> int:
        """Return the field's flags integer, 0 when none is set."""
        return self._get_value('Ff', 0)  # type:ignore[no-any-return]

    def is_readonly(self) -> bool:
        """Return True when the ReadOnly flag is set."""
        return bool(self.flags() & FieldFlags.ReadOnly)

    def name(self) -> str:
        """Return the field's own decoded name, '' when it has none."""
        return pdfutils.decode_text(self._get_value('T', b''))

    def value(self) -> Any:
        """Return the field's value, inherited if necessary, or None."""
        return self._get_value('V', None)  # type:ignore[no-any-return]

    def set_value(self, value: Any) -> None:
        """Store value as this field's own 'V' entry."""
        self._source['V'] = value

    def fill_value(self) -> Any:
        """Return this field's own value, or DEFAULT_FILL without one.

        Unlike value(), this does not look at parent fields.
        """
        return resolve1(self._source.get('V', self.DEFAULT_FILL))

    def as_filled_fdf(self) -> Mapping[str, Any]:
        """Return a nested mapping of names and fill values.

        The result has 'T' when the field is named, 'V' when fill_value() is
        not None, and 'Kids' (a list of the same kind of mapping) when the
        field has kids.
        """
        retval: FieldSource = {}
        try:
            # Go through _get_value() so the name is resolved the same way
            # name() resolves it.
            retval['T'] = pdfutils.decode_text(self._get_value('T'))
        except KeyError:
            pass
        value = self.fill_value()
        if value is not None:
            retval['V'] = value
        kids = [kid.as_filled_fdf() for kid in self.kids()]
        if kids:
            retval['Kids'] = kids
        return retval

    def as_mapping(self, name_prefix: str='') -> Iterator[Tuple[str, 'FormField']]:
        """Yield (full name, field) pairs for this field and its descendants.

        A full name is the ancestors' names and the field's own name joined
        with '.'. Each field is yielded before its kids.
        """
        name = name_prefix + self.name()
        yield (name, self)
        kid_prefix = name + '.'
        for kid in self.kids():
            yield from kid.as_mapping(kid_prefix)
|
||||
|
||||
|
||||
class CheckboxField(FormField):
    """Button field whose value is read and written as a bool."""
    __slots__: Sequence[str] = []
    # State names assumed when the field does not define its own.
    OFF = 'Off'
    ON = 'Yes'

    def options(self) -> Sequence[str]:
        """Return the field's state names as a new ``[on, off]`` list.

        The names come from the keys under ['AP']['N']. Missing names fall
        back to ON and OFF. Raises PDFSpecError when there are more than two
        states, or two states of which neither is OFF or ON.
        """
        # Deliberately not cached: a cache on the method would hold on to
        # every instance, hand out one shared mutable list, and keep serving
        # old names after the source dictionary changes.
        try:
            # Either level may be stored as an indirect object.
            appearances = resolve1(self._source['AP'])
            keys: Tuple[str, ...] = tuple(resolve1(appearances['N']))
        except KeyError:
            keys = ()
        count = len(keys)
        if count == 0:
            return [self.ON, self.OFF]
        elif count == 1:
            return [keys[0], self.OFF]
        elif count > 2:
            raise PDFSpecError("checkbox has more than two states available")
        try:
            off_index = keys.index(self.OFF)
        except ValueError:
            # No state named OFF: if one is named ON, the other one is off.
            try:
                off_index = 0 if keys.index(self.ON) else 1
            except ValueError:
                raise PDFSpecError("checkbox defines two on states") from None
        return [keys[0 if off_index else 1], keys[off_index]]

    def _bool_value(self, literal_value: Optional[psparser.PSLiteral]) -> Optional[bool]:
        """Convert a stored value to True/False, passing None through.

        Raises PDFSpecError when the value has no ``name`` or when its name
        is not one of this field's options.
        """
        if literal_value is None:
            return None
        try:
            value = literal_value.name
        except AttributeError:
            raise PDFSpecError("checkbox value is not a PSLiteral") from None
        on, off = self.options()
        if value == on:
            return True
        elif value == off:
            return False
        else:
            raise PDFSpecError(f"checkbox has unknown value {value!r}")

    def value(self) -> Optional[bool]:
        """Return the checkbox state as a bool, or None when it has no value."""
        return self._bool_value(super().value())

    def set_value(self, value: Optional[bool]) -> None:
        """Store the on or off state for a true or false value, or None."""
        if value is None:
            literal_value: Optional[psparser.PSLiteral] = None
        else:
            on, off = self.options()
            literal_value = psparser.PSLiteralTable.intern(on if value else off)
        super().set_value(literal_value)
|
||||
|
||||
|
||||
class TextField(FormField):
    """Text field whose value is read and written as a str."""
    __slots__: Sequence[str] = []
    # A text field without its own value fills as the empty string.
    DEFAULT_FILL = b''

    def _decode(self, value: Any) -> Optional[str]:
        """Decode a bytes value to str, passing None through.

        Raises PDFSpecError for any other kind of value.
        """
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise PDFSpecError("text field value is not bytes")
        return pdfutils.decode_text(value)

    def value(self) -> Optional[str]:
        """Return the field's value as text, or None when it has none."""
        raw_value = super().value()
        return self._decode(raw_value)

    def set_value(self, value: Optional[str]) -> None:
        """Store value in encoded form, or None to store no text."""
        if value is None:
            encoded = None
        else:
            encoded = pdfutils.encode_text(value)
        super().set_value(encoded)

    def fill_value(self) -> Optional[str]:
        """Return the field's own fill value as text."""
        raw_value = super().fill_value()
        return self._decode(raw_value)
|
53
conservancy_beancount/pdfforms/utils.py
Normal file
53
conservancy_beancount/pdfforms/utils.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
"""utils.py - Utility methods for working with PDFs"""
|
||||
# Copyright © 2020 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
# Full copyright and licensing details can be found at toplevel file
|
||||
# LICENSE.txt in the repository.
|
||||
|
||||
from codecs import BOM_UTF16_BE
|
||||
|
||||
import pdfminer.utils # type:ignore[import]
|
||||
|
||||
from . import errors as pdferrors
|
||||
|
||||
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
|
||||
from pdfminer.pdftypes import resolve1 # type:ignore[import]
|
||||
|
||||
from typing import (
|
||||
Callable,
|
||||
)
|
||||
|
||||
# pdfminer's text decoder, re-exported from this module with a type
# annotation, alongside encode_text().
decode_text: Callable[[bytes], str] = pdfminer.utils.decode_text
|
||||
|
||||
def encode_text(s: str) -> bytes:
    """Return s as bytes suitable for storing in a PDF

    Text that fits in ASCII is encoded as ASCII, which keeps it readable and
    compact. Anything else is encoded as UTF-16BE behind a byte order mark.
    """
    try:
        encoded = s.encode('ascii')
    except UnicodeEncodeError:
        encoded = BOM_UTF16_BE + s.encode('utf-16be')
    return encoded
|
||||
|
||||
def guess_form_key(pdf: PDFDocument) -> str:
    """Guess and return the PDF document catalog key with form data

    This function knows common catalog keys that hold PDF form data, and
    returns the first one whose catalog entry contains 'Fields'.

    Raises NoFormDataError (a ValueError) when no candidate key qualifies.
    """
    catalog = pdf.catalog
    for key in [
            'AcroForm',
            'FDF',
    ]:
        try:
            has_fields = 'Fields' in resolve1(catalog[key])
        except (KeyError, TypeError):
            # KeyError: the catalog has no such key.
            # TypeError: the entry is not something `in` can search.
            continue
        # The membership result has to be checked. An entry that exists but
        # holds no fields is not form data, so keep looking.
        if has_fields:
            return key
    raise pdferrors.NoFormDataError("could not find catalog key with form data")
|
3
setup.py
3
setup.py
|
@ -16,6 +16,7 @@ setup(
|
|||
'GitPython>=2.0', # Debian:python3-git
|
||||
# 1.4.1 crashes when trying to save some documents.
|
||||
'odfpy>=1.4.0,!=1.4.1', # Debian:python3-odf
|
||||
'pdfminer.six>=20200101',
|
||||
'PyYAML>=3.0', # Debian:python3-yaml
|
||||
'regex', # Debian:python3-regex
|
||||
'rt>=2.0',
|
||||
|
@ -31,6 +32,7 @@ setup(
|
|||
|
||||
packages=[
|
||||
'conservancy_beancount',
|
||||
'conservancy_beancount.pdfforms',
|
||||
'conservancy_beancount.plugin',
|
||||
'conservancy_beancount.reports',
|
||||
'conservancy_beancount.tools',
|
||||
|
@ -46,6 +48,7 @@ setup(
|
|||
'fund-report = conservancy_beancount.reports.fund:entry_point',
|
||||
'ledger-report = conservancy_beancount.reports.ledger:entry_point',
|
||||
'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
|
||||
'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point',
|
||||
'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point',
|
||||
],
|
||||
},
|
||||
|
|
72
tests/pdfforms/form1.fdf
Normal file
72
tests/pdfforms/form1.fdf
Normal file
|
@ -0,0 +1,72 @@
|
|||
%FDF-1.2
|
||||
%âãÏÓ
|
||||
1 0 obj
|
||||
<<
|
||||
/FDF
|
||||
<<
|
||||
/Fields [
|
||||
<<
|
||||
/T (topform)
|
||||
/Kids [
|
||||
<<
|
||||
/T (text1_0)
|
||||
/FT /Tx
|
||||
/V ()
|
||||
>>
|
||||
<<
|
||||
/T (button1)
|
||||
/Kids [
|
||||
<<
|
||||
/FT /Btn
|
||||
/T (button1_0)
|
||||
/AP << /N << /1 1 0 R >> >>
|
||||
>>
|
||||
<<
|
||||
/FT /Btn
|
||||
/T (button1_1)
|
||||
/AP << /N << /2 1 0 R >> >>
|
||||
>>
|
||||
]
|
||||
>>
|
||||
<<
|
||||
/T (text1_1)
|
||||
/FT /Tx
|
||||
/V ()
|
||||
>>
|
||||
<<
|
||||
/T (text2_0)
|
||||
/FT /Tx
|
||||
/V ()
|
||||
>>
|
||||
<<
|
||||
/T (button2)
|
||||
/Kids [
|
||||
<<
|
||||
/FT /Btn
|
||||
/T (button2_0)
|
||||
/AP << /N << /1 1 0 R >> >>
|
||||
>>
|
||||
<<
|
||||
/FT /Btn
|
||||
/T (button2_1)
|
||||
/AP << /N << /2 1 0 R >> >>
|
||||
>>
|
||||
]
|
||||
>>
|
||||
<<
|
||||
% Readonly
|
||||
/T (text2_R)
|
||||
/FT /Tx
|
||||
/Ff 1
|
||||
>>
|
||||
]
|
||||
>>]
|
||||
>>
|
||||
>>
|
||||
endobj
|
||||
trailer
|
||||
|
||||
<<
|
||||
/Root 1 0 R
|
||||
>>
|
||||
%%EOF
|
25
tests/pdfforms/form1.yml
Normal file
25
tests/pdfforms/form1.yml
Normal file
|
@ -0,0 +1,25 @@
|
|||
- fdf:
|
||||
type: Tx
|
||||
name: topform.text1_0
|
||||
- fdf:
|
||||
type: Btn
|
||||
name: topform.button1.button1_0
|
||||
options: ['1', 'Off']
|
||||
- fdf:
|
||||
type: Btn
|
||||
name: topform.button1.button1_1
|
||||
options: ['2', 'Off']
|
||||
- fdf:
|
||||
type: Tx
|
||||
name: topform.text1_1
|
||||
- fdf:
|
||||
type: Tx
|
||||
name: topform.text2_0
|
||||
- fdf:
|
||||
type: Btn
|
||||
name: topform.button2.button2_0
|
||||
options: ['1', 'Off']
|
||||
- fdf:
|
||||
type: Btn
|
||||
name: topform.button2.button2_1
|
||||
options: ['2', 'Off']
|
62
tests/test_pdfforms_extract.py
Normal file
62
tests/test_pdfforms_extract.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
"""test_pdfforms_extract.py - Unit tests for PDF form extractor"""
|
||||
# Copyright © 2020 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
# Full copyright and licensing details can be found at toplevel file
|
||||
# LICENSE.txt in the repository.
|
||||
|
||||
import io
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from . import testutil
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conservancy_beancount.pdfforms import extract as extractmod
|
||||
|
||||
def compare_to_yaml(actual, yaml_path, from_file, form_key):
    """Assert that extracted form data matches an expected-fields YAML file.

    actual is the mapping returned by the extractor. yaml_path is a Path, or
    a file name under the pdfforms test directory. Each extracted field only
    has to match the keys its expected counterpart lists; extra keys in the
    extracted field are ignored.
    """
    if isinstance(yaml_path, str):
        yaml_path = testutil.test_path(f'pdfforms/{yaml_path}')
    with yaml_path.open() as yaml_file:
        expect_fields = yaml.safe_load(yaml_file)
    assert actual.get('from file') == from_file
    assert actual.get('form key') == form_key
    actual_fields = list(actual.get('fields', ()))
    # Check the counts up front. Padding the shorter side with None would
    # report a missing or extra field as an AttributeError or TypeError
    # instead of a readable assertion failure.
    assert len(actual_fields) == len(expect_fields)
    for act_f, exp_f in zip(actual_fields, expect_fields):
        for key, exp_value in exp_f.items():
            assert act_f[key] == exp_value
|
||||
|
||||
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_extract_from_path(fdf_filename, form_key, fields_yaml):
    """Extracting through from_path() yields the expected fields."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    with extractmod.FormExtractor.from_path(source_path) as extractor:
        extracted = extractor.extract()
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
|
||||
|
||||
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_extract_from_file(fdf_filename, form_key, fields_yaml):
    """Extracting through from_file() yields the expected fields."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    with source_path.open('rb') as fdf_file:
        extractor = extractmod.FormExtractor.from_file(fdf_file)
        extracted = extractor.extract()
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
|
||||
|
||||
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_main(fdf_filename, form_key, fields_yaml):
    """main() exits 0, writes nothing to stderr, and dumps the expected YAML."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    stdout = io.StringIO()
    stderr = io.StringIO()
    returncode = extractmod.main([str(source_path)], stdout, stderr)
    assert returncode == 0
    assert not stderr.getvalue()
    stdout.seek(0)
    extracted = yaml.safe_load(stdout)
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
|
350
tests/test_pdfforms_fields.py
Normal file
350
tests/test_pdfforms_fields.py
Normal file
|
@ -0,0 +1,350 @@
|
|||
"""test_pdfforms_fields.py - Unit tests for PDF forms manipulation"""
|
||||
# Copyright © 2020 Brett Smith
|
||||
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
|
||||
#
|
||||
# Full copyright and licensing details can be found at toplevel file
|
||||
# LICENSE.txt in the repository.
|
||||
|
||||
import codecs
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
|
||||
from pdfminer.psparser import PSLiteral
|
||||
|
||||
from conservancy_beancount.pdfforms import fields as fieldsmod
|
||||
|
||||
def field_source(
        name=None,
        value=None,
        field_type=None,
        flags=None,
        parent=None,
        kids=None,
        *,
        literal=None,
):
    """Build a field dictionary for tests from the arguments that are set.

    A str name is stored ASCII-encoded. The value is wrapped in a PSLiteral
    when literal is true; when literal is None, that is decided by whether
    field_type is set to something other than 'Tx'.
    """
    retval = {}
    if name is not None:
        retval['T'] = name.encode('ascii') if isinstance(name, str) else name
    if value is not None:
        if literal is None:
            literal = field_type and field_type != 'Tx'
        retval['V'] = PSLiteral(value) if literal else value
    if field_type is not None:
        retval['FT'] = PSLiteral(field_type)
    for key, setting in [('Ff', flags), ('Parent', parent)]:
        if setting is not None:
            retval[key] = setting
    if kids is not None:
        retval['Kids'] = list(kids)
    return retval
|
||||
|
||||
def appearance_states(*names):
    """Return a dict with a distinct placeholder object for each non-None name."""
    states = {}
    for key in names:
        if key is not None:
            states[key] = object()
    return states
|
||||
|
||||
def test_empty_field():
    """A field wrapping an empty dictionary reports empty defaults."""
    field = fieldsmod.FormField(field_source())
    assert not field.name()
    assert field.value() is None
    assert field.parent() is None
    assert not list(field.kids())
    assert field.flags() == 0
    assert field.is_terminal()
    with pytest.raises(ValueError):
        field.field_type()
|
||||
|
||||
def test_text_field_base():
    """The base class reports a text field's type, name, and raw value."""
    field = fieldsmod.FormField(field_source(b's', b'string of text', 'Tx'))
    assert field.field_type() is fieldsmod.FieldType.TEXT
    assert field.name() == 's'
    assert field.value() == b'string of text'
|
||||
|
||||
@pytest.mark.parametrize('value', ['Off', 'Yes', 'On'])
def test_checkbox_field_base(value):
    """The base class reports a button's type, name, and literal value."""
    field = fieldsmod.FormField(field_source(b'cb', value, 'Btn', literal=True))
    assert field.field_type() is fieldsmod.FieldType.BUTTON
    assert field.name() == 'cb'
    assert field.value().name == value
|
||||
|
||||
@pytest.mark.parametrize('flags', range(4))
def test_readonly_flag(flags):
    """is_readonly() follows the lowest bit of the flags."""
    field = fieldsmod.FormField(field_source(flags=flags))
    assert field.flags() == flags
    assert field.is_readonly() == flags % 2
|
||||
|
||||
@pytest.mark.parametrize('kid_count', range(3))
def test_kids(kid_count):
    """kids() yields one field per kid, in order, with the right names."""
    kid_sources = [
        field_source(f'kid{n}', field_type='Ch')
        for n in range(kid_count)
    ]
    field = fieldsmod.FormField(field_source(kids=iter(kid_sources)))
    got_kids = list(field.kids())
    assert len(got_kids) == len(kid_sources)
    assert field.is_terminal() == (not kid_sources)
    for actual, expected in zip(got_kids, kid_sources):
        assert actual.name() == expected['T'].decode('ascii')
|
||||
|
||||
def test_kids_by_type():
    """kids() returns each kid as the class matching its field type."""
    kid_sources = [field_source(field_type='Tx'), field_source(field_type='Btn')]
    top = fieldsmod.FormField.by_type(field_source('topform', kids=iter(kid_sources)))
    actual = top.kids()
    assert isinstance(next(actual), fieldsmod.TextField)
    assert isinstance(next(actual), fieldsmod.CheckboxField)
    assert next(actual, None) is None
|
||||
|
||||
def test_inheritance():
    """A kid inherits type, value, and flags from its parent, but not its name."""
    parent_source = field_source(b'parent', 'parent value', 'Tx', 17)
    kid_source = field_source('kid', parent=parent_source)
    parent_source['Kids'] = [kid_source]
    kid = fieldsmod.FormField(kid_source)
    parent = kid.parent()
    assert parent is not None
    assert parent.name() == 'parent'
    assert not parent.is_terminal()
    assert kid.is_terminal()
    assert kid.name() == 'kid'
    assert kid.field_type() is fieldsmod.FieldType.TEXT
    assert kid.value() == 'parent value'
    assert kid.flags() == 17
    assert not list(kid.kids())
|
||||
|
||||
@pytest.mark.parametrize('field_type,value', [
    ('Tx', b'new value'),
    ('Btn', PSLiteral('Yes')),
])
def test_set_value(field_type, value):
    """The base class stores and returns values unchanged."""
    field = fieldsmod.FormField(field_source(field_type=field_type))
    assert field.value() is None
    field.set_value(value)
    assert field.value() == value
|
||||
|
||||
@pytest.mark.parametrize('field_type,expected', [
    ('Tx', fieldsmod.TextField),
    ('Btn', fieldsmod.CheckboxField),
])
def test_by_type(field_type, expected):
    """by_type() picks the dedicated class for supported field types."""
    field = fieldsmod.FormField.by_type(field_source(field_type=field_type))
    assert isinstance(field, expected)
|
||||
|
||||
def test_container_by_type():
    """A container without a field type of its own stays a plain FormField."""
    kids = [field_source(field_type='Tx'), field_source(field_type='Btn')]
    source = field_source('topform', kids=iter(kids))
    field = fieldsmod.FormField.by_type(source)
    # Compare the exact class, like the unsupported-type tests do. An
    # isinstance() check would also accept every FormField subclass, so it
    # could never catch by_type() returning the wrong one.
    assert type(field) is fieldsmod.FormField
|
||||
|
||||
@pytest.mark.parametrize('flag', [
    # If you add dedicated classes for these types of buttons, you can remove
    # their test cases.
    fieldsmod.FieldFlags.Radio,
    fieldsmod.FieldFlags.Pushbutton,
])
def test_unsupported_button_by_type(flag):
    """Buttons without a dedicated class come back as plain FormField."""
    field = fieldsmod.FormField.by_type(field_source(field_type='Btn', flags=flag))
    assert type(field) is fieldsmod.FormField
|
||||
|
||||
@pytest.mark.parametrize('field_type', [
    # If you add dedicated classes for these types of fields, you can remove
    # their test cases.
    'Ch',
    'Sig',
])
def test_unsupported_field_by_type(field_type):
    """Field types without a dedicated class come back as plain FormField."""
    field = fieldsmod.FormField.by_type(field_source(field_type=field_type))
    assert type(field) is fieldsmod.FormField
|
||||
|
||||
@pytest.mark.parametrize('value', [None, 'Off', 'Yes'])
def test_checkbox_value(value):
    """A checkbox reads None as None, 'Off' as False, and 'Yes' as True."""
    checkbox = fieldsmod.CheckboxField(field_source('cb', value, 'Btn', literal=True))
    assert checkbox.value() == (value and value == 'Yes')
|
||||
|
||||
@pytest.mark.parametrize('value,expected', [
    (None, None),
    (False, 'Off'),
    (True, 'Yes'),
])
def test_checkbox_set_value(value, expected):
    """Setting a bool stores the matching default state name."""
    checkbox = fieldsmod.CheckboxField(field_source('cb', field_type='Btn'))
    checkbox.set_value(value)
    # Read through the base class to see what was actually stored.
    stored = fieldsmod.FormField.value(checkbox)
    if expected is None:
        assert stored is None
    else:
        assert stored.name == expected
|
||||
|
||||
@pytest.mark.parametrize('on_key,off_key', itertools.product(
    ['1', '2', 'On', 'Yes'],
    ['Off', None],
))
def test_checkbox_options(on_key, off_key):
    """options() reports the custom on state, with 'Off' as the off state."""
    source = field_source('cb', field_type='Btn')
    source['AP'] = {'N': appearance_states(on_key, off_key)}
    checkbox = fieldsmod.CheckboxField(source)
    assert checkbox.options() == [on_key, 'Off']
|
||||
|
||||
def test_checkbox_options_yes_no():
    """With no 'Off' state, the state next to 'Yes' is taken as off."""
    # I'm not sure this is actually allowed under the spec, but…
    expected = ['Yes', 'No']
    source = field_source('cb', field_type='Btn')
    source['AP'] = {'N': appearance_states(*expected)}
    checkbox = fieldsmod.CheckboxField(source)
    assert checkbox.options() == expected
|
||||
|
||||
@pytest.mark.parametrize('on_key,off_key,set_value', itertools.product(
    ['1', '2', 'On', 'Yes'],
    ['Off', None],
    [True, False, None],
))
def test_checkbox_set_custom_value(on_key, off_key, set_value):
    """Setting a bool stores the field's own state names."""
    source = field_source('cb', field_type='Btn')
    source['AP'] = {'N': appearance_states(on_key, off_key)}
    checkbox = fieldsmod.CheckboxField(source)
    checkbox.set_value(set_value)
    # Read through the base class to see what was actually stored.
    stored = fieldsmod.FormField.value(checkbox)
    if set_value is None:
        assert stored is None
    elif set_value:
        assert stored.name == (on_key or 'Yes')
    else:
        assert stored.name == 'Off'
|
||||
|
||||
@pytest.mark.parametrize('encoding,prefix', [
    ('ascii', b''),
    ('utf-16be', codecs.BOM_UTF16_BE),
])
def test_text_value(encoding, prefix):
    """A text field decodes both ASCII and BOM-prefixed UTF-16BE values."""
    expected = f'{encoding} encoding test'
    stored = prefix + expected.encode(encoding)
    text_field = fieldsmod.TextField(field_source('t', stored, 'Tx'))
    assert text_field.value() == expected
|
||||
|
||||
def test_text_value_none():
    """A text field with no value reports None."""
    text_field = fieldsmod.TextField(field_source(field_type='Tx'))
    assert text_field.value() is None
|
||||
|
||||
@pytest.mark.parametrize('text,bprefix', [
    ('ASCII test', b''),
    ('UTF—16 test', codecs.BOM_UTF16_BE),
])
def test_text_set_value(text, bprefix):
    """Set text reads back unchanged and is stored in the expected encoding."""
    text_field = fieldsmod.TextField(field_source(field_type='Tx'))
    text_field.set_value(text)
    assert text_field.value() == text
    # Read through the base class to see the bytes that were stored.
    stored = fieldsmod.FormField.value(text_field)
    encoding = 'utf-16be' if bprefix else 'ascii'
    assert stored == bprefix + text.encode(encoding)
|
||||
|
||||
def test_text_set_value_none():
    """Setting None on a text field stores None."""
    text_field = fieldsmod.TextField(field_source('t', b'set None test', 'Tx'))
    text_field.set_value(None)
    assert fieldsmod.FormField.value(text_field) is None
|
||||
|
||||
def test_empty_as_filled_fdf():
    """An empty field exports as an empty mapping."""
    field = fieldsmod.FormField(field_source())
    assert field.as_filled_fdf() == {}
|
||||
|
||||
@pytest.mark.parametrize('field_type,field_class,set_value', [
    ('Btn', fieldsmod.CheckboxField, True),
    ('Btn', fieldsmod.CheckboxField, False),
    ('Ch', fieldsmod.FormField, None),
    ('Tx', fieldsmod.TextField, 'export test'),
    ('Tx', fieldsmod.TextField, 'UTF—16 export'),
])
def test_as_filled_fdf_after_set_value(field_type, field_class, set_value):
    """The export holds the field's name plus whatever value was set."""
    # The field type doubles as the field's name here.
    field = field_class(field_source(field_type, field_type=field_type))
    field.set_value(set_value)
    exported = field.as_filled_fdf()
    assert exported['T'] == field_type
    if set_value is None:
        assert 'V' not in exported
        expect_len = 1
    else:
        expect_len = 2
        if field_class is fieldsmod.CheckboxField:
            assert exported['V'].name == ('Yes' if set_value else 'Off')
        else:
            assert exported['V'] == set_value
    assert len(exported) == expect_len
|
||||
|
||||
@pytest.mark.parametrize('field_type,expected', [
    ('Btn', None),
    ('Tx', ''),
])
def test_as_filled_fdf_default_value(field_type, expected):
    """A field with no value exports its class's default fill value."""
    field = fieldsmod.FormField.by_type(field_source(field_type=field_type))
    exported = field.as_filled_fdf()
    assert exported.get('V') == expected
|
||||
|
||||
def test_as_filled_fdf_recursion():
    """The export nests kids under 'Kids' at every level, in order."""
    buttons = [field_source(f'bt{n}', field_type='Btn') for n in range(1, 3)]
    pair = field_source('Buttons', kids=iter(buttons))
    text = field_source('tx', field_type='Tx')
    top = fieldsmod.FormField(field_source('topform', kids=[text, pair]))
    exported = top.as_filled_fdf()
    assert exported['T'] == 'topform'
    assert 'V' not in exported
    top_kids = iter(exported['Kids'])
    assert next(top_kids)['T'] == 'tx'
    exported_pair = next(top_kids)
    assert exported_pair['T'] == 'Buttons'
    assert 'V' not in exported_pair
    pair_kids = iter(exported_pair['Kids'])
    assert next(pair_kids)['T'] == 'bt1'
    assert next(pair_kids)['T'] == 'bt2'
    assert next(pair_kids, None) is None
|
||||
|
||||
@pytest.mark.parametrize('name,value,field_type', [
    (None, None, None),
    ('mt', 'mapping text', 'Tx'),
    ('mb', 'Yes', 'Btn'),
])
def test_simple_as_mapping(name, value, field_type):
    """A field without kids maps to exactly one (name, field) pair."""
    field = fieldsmod.FormField(field_source(name, value, field_type))
    pairs = field.as_mapping()
    key, mapped = next(pairs)
    assert key == (name or '')
    assert mapped is field
    assert next(pairs, None) is None
|
||||
|
||||
def test_recursive_as_mapping():
    """as_mapping() yields dotted full names, each parent before its kids."""
    btn_kids = [field_source(f'btn{n}', field_type='Btn') for n in range(1, 3)]
    buttons = field_source('buttons', kids=iter(btn_kids))
    text_kids = [field_source(f'tx{n}', field_type='Tx') for n in range(1, 3)]
    texts = field_source('texts', kids=iter(text_kids))
    root_field = fieldsmod.FormField(field_source('root', kids=[texts, buttons]))
    expected_keys = [
        'root',
        'root.texts',
        'root.texts.tx1',
        'root.texts.tx2',
        'root.buttons',
        'root.buttons.btn1',
        'root.buttons.btn2',
    ]
    pairs = root_field.as_mapping()
    for expected_key in expected_keys:
        key, field = next(pairs)
        assert key == expected_key
        expected_name = expected_key.rpartition('.')[2]
        assert field.name() == expected_name
    assert next(pairs, None) is None
|
Loading…
Reference in a new issue