pdfforms: Initial module and tool to extract PDF form data to YAML.

Next steps:

* A tool to fill the PDF form based on values written to that YAML.
* An extension to fill some of those values with numbers queried from the
  books (which is why we need something more involved than FDF).
This commit is contained in:
Brett Smith 2021-01-04 16:10:11 -05:00
parent 1b7fdf4f3b
commit 13c66e8ce2
10 changed files with 1007 additions and 0 deletions

View file

@ -0,0 +1,18 @@
"""errors.py - Exception classes for PDF reporting errors"""
# Copyright © 2021 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
class PDFError(Exception):
    """Common base class for the PDF exceptions defined in this module."""
class PDFKeyError(KeyError, PDFError):
    """A KeyError that can also be caught as a PDFError."""
class PDFSpecError(ValueError, PDFError):
    """A ValueError that can also be caught as a PDFError."""
class NoFormDataError(ValueError, PDFError):
    """A ValueError/PDFError for documents where no form data was found."""

View file

@ -0,0 +1,179 @@
"""extract.py - Extract form data from PDF files"""
# Copyright © 2021 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
import argparse
import contextlib
import logging
import sys
import yaml
from . import fields as fieldmod
from . import utils as pdfutils
from .. import cliutil
from pathlib import Path
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
from pdfminer.pdfparser import PDFParser # type:ignore[import]
from pdfminer.pdftypes import resolve1 # type:ignore[import]
from typing import (
Any,
BinaryIO,
Iterator,
Mapping,
Optional,
Sequence,
TextIO,
)
# Program name given to argparse and to the entry point factory.
PROGNAME = 'pdfform-extract'
# Module logger; main() sets its level from the command line arguments.
logger = logging.getLogger('conservancy_beancount.pdfforms.extract')
class FormExtractor:
    """Extract the fillable fields of a PDF or FDF document.

    ``extract()`` returns a plain mapping (suitable for YAML output) that
    records the source file name, the catalog key the form was read from,
    and one entry per writable, typed field.
    """

    def __init__(
            self,
            pdf: PDFDocument,
            form_key: Optional[str]=None,
            source: Optional[str]=None,
    ) -> None:
        """Wrap an already-parsed document.

        ``form_key`` is the document catalog key that holds the form; when
        it is None the key is guessed from the document. ``source`` is a
        label for the document, reported as ``from file`` in the output.
        """
        if form_key is None:
            form_key = pdfutils.guess_form_key(pdf)
        self.document = pdf
        self.form_key = form_key
        self.source = source

    @classmethod
    @contextlib.contextmanager
    def from_path(
            cls,
            path: Path,
            form_key: Optional[str]=None,
    ) -> Iterator['FormExtractor']:
        """Context manager yielding an extractor for the file at ``path``.

        The file stays open for the duration of the ``with`` block and is
        closed when the block exits, so call ``extract()`` inside it.
        """
        with path.open('rb') as pdf_file:
            yield cls.from_file(pdf_file, form_key, path)

    @classmethod
    def from_file(
            cls,
            source: BinaryIO,
            form_key: Optional[str]=None,
            source_path: Optional[Path]=None,
    ) -> 'FormExtractor':
        """Return an extractor reading from the open binary file ``source``.

        ``source_path`` only supplies the reported file name; when omitted
        it is taken from ``source.name``. The caller keeps ownership of
        ``source`` and is responsible for closing it.
        """
        if source_path is None:
            source_path = Path(source.name)
        parser = PDFParser(source)
        pdf_doc = PDFDocument(parser)
        return cls(pdf_doc, form_key, source_path.name)

    def _extract_field(
            self,
            field: fieldmod.FormField,
            name_prefix: str='',
    ) -> Iterator[Mapping[str, Any]]:
        """Yield output mappings for ``field`` and all of its descendants.

        A field is reported only when it has a valid field type and is not
        read-only. Its kids are always visited, with this field's full name
        plus ``.`` as their name prefix.
        """
        name = name_prefix + field.name()
        try:
            field_type: Optional[str] = field.field_type().name
        except ValueError:
            # Untyped fields (typically pure containers) are not reported.
            field_type = None
        if field_type is not None and not field.is_readonly():
            retval = {
                'fdf': {
                    'type': field_type,
                    'name': name,
                },
                'description': f'{field_type} {name}',
                'value': field.fill_value(),
            }
            if isinstance(field, fieldmod.CheckboxField):
                retval['fdf']['options'] = field.options()
            yield retval
        name += '.'
        for kid in field.kids():
            yield from self._extract_field(kid, name)

    def extract(self) -> Mapping[str, Any]:
        """Return the extracted form as a mapping.

        Keys are ``from file``, ``form key`` and ``fields``; the last is a
        list of the mappings produced for every reportable field, in
        document order.
        """
        form = resolve1(self.document.catalog[self.form_key])
        # resolve1() only dereferences the object it is given, so the Fields
        # array must be resolved separately in case it is stored indirectly.
        field_sources = resolve1(form['Fields'])
        fields = []
        for field_source in field_sources:
            top_field = fieldmod.FormField.by_type(resolve1(field_source))
            fields.extend(self._extract_field(top_field))
        return {
            'from file': self.source,
            'form key': self.form_key,
            'fields': fields,
        }
class FormYAMLDumper(yaml.dumper.SafeDumper):
    """SafeDumper that writes block-style mappings without re-sorting them."""

    def represent_mapping(self, tag: Any, value: Any, flow_style: Any=None) -> Any:
        """Represent ``value`` in block style unless a flow style is given."""
        # Mappings should never be flowed by default.
        style = False if flow_style is None else flow_style
        # The parent implementation calls value.items() when it can and then
        # re-sorts the result. Handing it the items view directly bypasses
        # that and keeps the original order.
        try:
            pairs = value.items()
        except AttributeError:
            pairs = value
        return super().represent_mapping(tag, pairs, style)
def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
    """Parse the command line in ``arglist`` and return the namespace.

    Defines the shared version and log level options, ``--form-key``,
    ``--output-file`` and the positional ``document`` path.
    """
    argparser = argparse.ArgumentParser(prog=PROGNAME)
    cliutil.add_version_argument(argparser)
    cliutil.add_loglevel_argument(argparser)
    argparser.add_argument(
        '--form-key', '-f',
        metavar='KEY',
        help="""Key in the document catalog with form data.
Default is guessed by examining the document.
""",
    )
    argparser.add_argument(
        '--output-file', '-O',
        metavar='PATH',
        type=Path,
        help="""Write output YAML to this file, or stdout when PATH is `-`.
Default stdout.
""",
    )
    argparser.add_argument(
        'document',
        type=Path,
        help="""PDF or FDF file to extract form data from.
Use `-` to read from stdin.
""",
    )
    namespace = argparser.parse_args(arglist)
    return namespace
def main(arglist: Optional[Sequence[str]]=None,
         stdout: TextIO=sys.stdout,
         stderr: TextIO=sys.stderr,
) -> int:
    """Run the extractor command line tool and return its exit status.

    Reads the document named in ``arglist`` (or stdin when it is the
    standard stream path), extracts its form fields, and writes them as
    YAML to the requested output file or to ``stdout``. ``stderr`` is
    accepted for interface consistency and is not written to here.
    """
    # Local import: only needed to buffer a non-seekable stdin.
    import io
    args = parse_arguments(arglist)
    cliutil.set_loglevel(logger, args.loglevel)
    with contextlib.ExitStack() as exit_stack:
        if args.document == cliutil.STDSTREAM_PATH:
            pdf_in: BinaryIO = sys.stdin.buffer
            source_path = Path(pdf_in.name)
            if not pdf_in.seekable():
                # The PDF parser seeks within its input, which a pipe
                # cannot do, so read piped input into memory first.
                pdf_in = io.BytesIO(pdf_in.read())
            extractor = FormExtractor.from_file(pdf_in, args.form_key, source_path)
        else:
            extractor = exit_stack.enter_context(
                FormExtractor.from_path(args.document, args.form_key),
            )
        # Extract while the input file is still open.
        extracted_form = extractor.extract()
    with contextlib.ExitStack() as exit_stack:
        out_file = cliutil.text_output(args.output_file, stdout)
        # Only close files opened here; the caller owns ``stdout``.
        if out_file is not stdout:
            exit_stack.enter_context(out_file)
        yaml.dump(extracted_form, out_file, Dumper=FormYAMLDumper)
    return 0
# Console-script entry point built by the shared CLI helper.
entry_point = cliutil.make_entry_point(__name__, PROGNAME)

if __name__ == '__main__':
    # Use sys.exit(): the bare exit() builtin is installed by the site
    # module for interactive use and is absent under ``python -S``.
    sys.exit(entry_point())

View file

@ -0,0 +1,245 @@
"""fields.py - Python classes to read and write PDF form data"""
# Copyright © 2020 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
import enum
import functools
from pdfminer.pdftypes import resolve1 # type:ignore[import]
from pdfminer import psparser # type:ignore[import]
from . import utils as pdfutils
from .errors import PDFKeyError, PDFSpecError
from typing import (
Any,
Iterator,
Optional,
Mapping,
MutableMapping,
Sequence,
Tuple,
Union,
)
# Type of the raw field dictionary that FormField wraps and updates.
FieldSource = MutableMapping[str, Any]
class FieldFlags(enum.IntFlag):
    """Bit values tested against a field's flags (the ``Ff`` entry).

    Button and text flags share one integer space, so ``RichText`` has the
    same value as ``RadiosInUnison`` and is an alias of it.
    """
    # Flags for all fields
    ReadOnly = 1 << 0
    Required = 1 << 1
    NoExport = 1 << 2
    # Flags for buttons
    NoToggleToOff = 1 << 14
    Radio = 1 << 15
    Pushbutton = 1 << 16
    RadiosInUnison = 1 << 25
    # Flags for text
    Multiline = 1 << 12
    Password = 1 << 13
    FileSelect = 1 << 20
    DoNotSpellCheck = 1 << 22
    DoNotScroll = 1 << 23
    Comb = 1 << 24
    RichText = 1 << 25
class FieldType(enum.Enum):
    """Field types, keyed by the name used in a field's ``FT`` entry.

    The upper-case members are readable aliases of the short names.
    """
    Btn = 'Btn'
    BUTTON = 'Btn'
    Ch = 'Ch'
    CHOICE = 'Ch'
    Sig = 'Sig'
    SIG = 'Sig'
    SIGNATURE = 'Sig'
    Tx = 'Tx'
    TEXT = 'Tx'
class FormField:
    """Wrapper around one PDF form field dictionary.

    This base class exposes raw values. ``by_type()`` is the usual
    constructor: it returns an instance of the most specific class this
    module has for the field's type.
    """
    __slots__ = ['_source']
    # Marks "no default given" in _get_value(), so None can be a default.
    _SENTINEL = object()
    # What fill_value() reports when the field itself has no V entry.
    DEFAULT_FILL: object = None
    # Keys that _get_value() also looks for in ancestor fields.
    INHERITABLE = frozenset([
        'DV',
        'Ff',
        'FT',
        'MaxLen',
        'Opt',
        'V',
    ])

    def __init__(self, source: FieldSource) -> None:
        """Wrap the field dictionary ``source`` without copying it."""
        self._source = source

    @classmethod
    def by_type(cls, source: FieldSource) -> 'FormField':
        """Return a field object of the most specific class for ``source``.

        Text fields become ``TextField`` and buttons that are neither radio
        buttons nor pushbuttons become ``CheckboxField``. Everything else,
        including fields without a valid type, is a plain ``FormField``.
        """
        # Start from the base class rather than ``cls``: kids() and parent()
        # call this through an instance, and an untyped or unsupported
        # relative of a TextField must not come back as a TextField.
        retval = FormField(source)
        try:
            field_type = retval.field_type()
        except ValueError:
            return retval
        flags = retval.flags()
        if field_type is FieldType.BUTTON:
            # Radio buttons and pushbuttons have no dedicated class.
            if not (flags & (FieldFlags.Radio | FieldFlags.Pushbutton)):
                retval.__class__ = CheckboxField
        elif field_type is FieldType.TEXT:
            retval.__class__ = TextField
        return retval

    def _get_value(self, key: str, default: Any=_SENTINEL) -> Any:
        """Return the resolved value of ``key`` for this field.

        Keys listed in ``INHERITABLE`` are also searched for up the chain
        of ``Parent`` entries. When the key is not found, return
        ``default`` if one was given, otherwise raise ``PDFKeyError``.
        """
        can_inherit = key in self.INHERITABLE
        source: Optional[FieldSource] = self._source
        while source is not None:
            try:
                return resolve1(source[key])
            except KeyError:
                source = resolve1(source.get('Parent')) if can_inherit else None
        if default is self._SENTINEL:
            raise PDFKeyError(key)
        else:
            return default

    def field_type(self) -> FieldType:
        """Return this field's (possibly inherited) type.

        Raises ``PDFSpecError`` when no type is specified or when it does
        not name a ``FieldType`` member.
        """
        try:
            source = self._get_value('FT')
        except KeyError:
            raise PDFSpecError("field does not specify a field type") from None
        try:
            return FieldType[source.name]
        except (AttributeError, KeyError):
            raise PDFSpecError(f"field has invalid field type {source!r}") from None

    def kids(self) -> Iterator['FormField']:
        """Yield a typed field object for each entry in ``Kids``."""
        for source in self._get_value('Kids', ()):
            yield self.by_type(resolve1(source))

    def parent(self) -> Optional['FormField']:
        """Return the typed parent field, or None when there is none."""
        try:
            return self.by_type(self._get_value('Parent'))
        except KeyError:
            return None

    def is_terminal(self) -> bool:
        """Return True when the field has no kids."""
        return not self._get_value('Kids', None)

    def flags(self) -> int:
        """Return the (possibly inherited) field flags, 0 when unset."""
        return self._get_value('Ff', 0)  # type:ignore[no-any-return]

    def is_readonly(self) -> bool:
        """Return True when the ReadOnly flag is set."""
        return bool(self.flags() & FieldFlags.ReadOnly)

    def name(self) -> str:
        """Return the field's own decoded name, '' when it has none."""
        return pdfutils.decode_text(self._get_value('T', b''))

    def value(self) -> Any:
        """Return the raw (possibly inherited) value, None when unset."""
        return self._get_value('V', None)  # type:ignore[no-any-return]

    def set_value(self, value: Any) -> None:
        """Store ``value`` as this field's own raw value."""
        self._source['V'] = value

    def fill_value(self) -> Any:
        """Return this field's own value, or ``DEFAULT_FILL`` when unset.

        Unlike ``value()``, this does not look at parent fields.
        """
        return resolve1(self._source.get('V', self.DEFAULT_FILL))

    def as_filled_fdf(self) -> Mapping[str, Any]:
        """Return a nested mapping of this field and its kids.

        ``T`` and ``Kids`` appear only when the field has a name or kids,
        and ``V`` only when ``fill_value()`` is not None.
        """
        retval: FieldSource = {}
        try:
            # Resolve first, as name() does, in case T is stored indirectly.
            retval['T'] = pdfutils.decode_text(resolve1(self._source['T']))
        except KeyError:
            pass
        value = self.fill_value()
        if value is not None:
            retval['V'] = value
        kids = [kid.as_filled_fdf() for kid in self.kids()]
        if kids:
            retval['Kids'] = kids
        return retval

    def as_mapping(self, name_prefix: str='') -> Iterator[Tuple[str, 'FormField']]:
        """Yield ``(full name, field)`` for this field and its descendants.

        Full names join each ancestor's name with ``.``; a field is yielded
        before its kids.
        """
        name = name_prefix + self.name()
        yield (name, self)
        name += '.'
        for kid in self.kids():
            yield from kid.as_mapping(name)
class CheckboxField(FormField):
    """A button field treated as a two-state checkbox.

    ``value()`` and ``set_value()`` work with booleans (or None for "no
    value") and translate them to and from the field's state names.
    """
    __slots__: Sequence[str] = []
    # State names assumed when the field does not define its own.
    OFF = 'Off'
    ON = 'Yes'

    def options(self) -> Sequence[str]:
        """Return ``[on_name, off_name]`` for this checkbox.

        Names come from the keys of the normal appearance dictionary
        (``AP`` then ``N``), with ``ON``/``OFF`` filling in what is missing.
        Raises ``PDFSpecError`` when more than two states are defined, or
        when two are defined and neither is ``OFF`` or ``ON``.
        """
        # Not cached: an lru_cache on a method keeps every instance alive
        # and would go stale if the source changed. The lookup is cheap.
        try:
            # Either level may be stored as an indirect reference.
            appearances = resolve1(resolve1(self._source['AP'])['N'])
            keys: Tuple[str, ...] = tuple(appearances)
        except KeyError:
            keys = ()
        count = len(keys)
        if count == 0:
            return [self.ON, self.OFF]
        elif count == 1:
            return [keys[0], self.OFF]
        elif count > 2:
            raise PDFSpecError("checkbox has more than two states available")
        try:
            off_index = keys.index(self.OFF)
        except ValueError:
            # No standard off state: if one key is ON, the other is "off".
            try:
                off_index = 0 if keys.index(self.ON) else 1
            except ValueError:
                raise PDFSpecError("checkbox defines two on states") from None
        return [keys[0 if off_index else 1], keys[off_index]]

    def _bool_value(self, literal_value: Optional[psparser.PSLiteral]) -> Optional[bool]:
        """Translate a raw state literal into True, False or None.

        Raises ``PDFSpecError`` when the value has no ``name`` attribute or
        names a state that is not one of this checkbox's options.
        """
        if literal_value is None:
            return None
        try:
            value = literal_value.name
        except AttributeError:
            raise PDFSpecError("checkbox value is not a PSLiteral") from None
        on, off = self.options()
        if value == on:
            return True
        elif value == off:
            return False
        else:
            raise PDFSpecError(f"checkbox has unknown value {value!r}")

    def value(self) -> Optional[bool]:
        """Return the checkbox state as a boolean, or None when unset."""
        return self._bool_value(super().value())

    def set_value(self, value: Optional[bool]) -> None:
        """Store the state literal for ``value``; None stores None."""
        if value is None:
            literal_value: Optional[psparser.PSLiteral] = None
        else:
            on, off = self.options()
            literal_value = psparser.PSLiteralTable.intern(on if value else off)
        super().set_value(literal_value)
class TextField(FormField):
    """A text field whose values are exchanged as ``str`` rather than bytes."""
    __slots__: Sequence[str] = []
    # An unset text field fills as the empty string.
    DEFAULT_FILL = b''

    def _decode(self, value: Any) -> Optional[str]:
        """Decode raw bytes to text; pass None through; reject anything else."""
        if isinstance(value, bytes):
            return pdfutils.decode_text(value)
        if value is None:
            return None
        raise PDFSpecError("text field value is not bytes")

    def value(self) -> Optional[str]:
        """Return the decoded (possibly inherited) value, or None."""
        raw_value = super().value()
        return self._decode(raw_value)

    def set_value(self, value: Optional[str]) -> None:
        """Encode ``value`` for PDF and store it; None is stored as None."""
        if value is None:
            encoded = None
        else:
            encoded = pdfutils.encode_text(value)
        super().set_value(encoded)

    def fill_value(self) -> Optional[str]:
        """Return the decoded value to fill in for this field."""
        raw_value = super().fill_value()
        return self._decode(raw_value)

View file

@ -0,0 +1,53 @@
"""utils.py - Utility methods for working with PDFs"""
# Copyright © 2020 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
from codecs import BOM_UTF16_BE
import pdfminer.utils # type:ignore[import]
from . import errors as pdferrors
from pdfminer.pdfdocument import PDFDocument # type:ignore[import]
from pdfminer.pdftypes import resolve1 # type:ignore[import]
from typing import (
Callable,
)
# pdfminer's text decoder, re-exported with an explicit type annotation.
decode_text: Callable[[bytes], str] = pdfminer.utils.decode_text
def encode_text(s: str) -> bytes:
    """Return ``s`` encoded as bytes for storage in a PDF.

    Pure-ASCII text is stored as ASCII, which keeps it readable and
    compact. Any other text is stored as UTF-16BE with a leading byte
    order mark.
    """
    try:
        encoded = s.encode('ascii')
    except UnicodeEncodeError:
        encoded = BOM_UTF16_BE + s.encode('utf-16be')
    return encoded
def guess_form_key(pdf: PDFDocument) -> str:
    """Guess and return the PDF document catalog key with form data

    This function knows common catalog keys that hold PDF form data,
    searches the given document for form data, and returns the best
    candidate: the first known key whose value contains ``Fields``.

    Raises NoFormDataError (a ValueError) when no known key holds form data.
    """
    catalog = pdf.catalog
    for key in [
            'AcroForm',
            'FDF',
    ]:
        try:
            has_fields = 'Fields' in resolve1(catalog[key])
        except (KeyError, TypeError):
            # Key is absent, or its value is not a container at all.
            continue
        # Only accept the key when it really carries form fields; a key
        # that merely exists would fail later when its Fields are read.
        if has_fields:
            return key
    raise pdferrors.NoFormDataError("could not find catalog key with form data")

View file

@ -16,6 +16,7 @@ setup(
'GitPython>=2.0', # Debian:python3-git
# 1.4.1 crashes when trying to save some documents.
'odfpy>=1.4.0,!=1.4.1', # Debian:python3-odf
'pdfminer.six>=20200101',
'PyYAML>=3.0', # Debian:python3-yaml
'regex', # Debian:python3-regex
'rt>=2.0',
@ -31,6 +32,7 @@ setup(
packages=[
'conservancy_beancount',
'conservancy_beancount.pdfforms',
'conservancy_beancount.plugin',
'conservancy_beancount.reports',
'conservancy_beancount.tools',
@ -46,6 +48,7 @@ setup(
'fund-report = conservancy_beancount.reports.fund:entry_point',
'ledger-report = conservancy_beancount.reports.ledger:entry_point',
'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point',
'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point',
],
},

72
tests/pdfforms/form1.fdf Normal file
View file

@ -0,0 +1,72 @@
%FDF-1.2
%âãÏÓ
1 0 obj
<<
/FDF
<<
/Fields [
<<
/T (topform)
/Kids [
<<
/T (text1_0)
/FT /Tx
/V ()
>>
<<
/T (button1)
/Kids [
<<
/FT /Btn
/T (button1_0)
/AP << /N << /1 1 0 R >> >>
>>
<<
/FT /Btn
/T (button1_1)
/AP << /N << /2 1 0 R >> >>
>>
]
>>
<<
/T (text1_1)
/FT /Tx
/V ()
>>
<<
/T (text2_0)
/FT /Tx
/V ()
>>
<<
/T (button2)
/Kids [
<<
/FT /Btn
/T (button2_0)
/AP << /N << /1 1 0 R >> >>
>>
<<
/FT /Btn
/T (button2_1)
/AP << /N << /2 1 0 R >> >>
>>
]
>>
<<
% Readonly
/T (text2_R)
/FT /Tx
/Ff 1
>>
]
>>]
>>
>>
endobj
trailer
<<
/Root 1 0 R
>>
%%EOF

25
tests/pdfforms/form1.yml Normal file
View file

@ -0,0 +1,25 @@
- fdf:
type: Tx
name: topform.text1_0
- fdf:
type: Btn
name: topform.button1.button1_0
options: ['1', 'Off']
- fdf:
type: Btn
name: topform.button1.button1_1
options: ['2', 'Off']
- fdf:
type: Tx
name: topform.text1_1
- fdf:
type: Tx
name: topform.text2_0
- fdf:
type: Btn
name: topform.button2.button2_0
options: ['1', 'Off']
- fdf:
type: Btn
name: topform.button2.button2_1
options: ['2', 'Off']

View file

@ -0,0 +1,62 @@
"""test_pdfforms_extract.py - Unit tests for PDF form extractor"""
# Copyright © 2020 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
import io
import itertools
import pytest
import yaml
from . import testutil
from pathlib import Path
from conservancy_beancount.pdfforms import extract as extractmod
def compare_to_yaml(actual, yaml_path, from_file, form_key):
    """Assert that extracted form data matches a YAML fixture.

    ``yaml_path`` may be a fixture file name under ``pdfforms/`` or a path
    object. Every key present in an expected field must have the same
    value in the corresponding extracted field; extra keys in the
    extracted field are ignored.
    """
    if isinstance(yaml_path, str):
        yaml_path = testutil.test_path(f'pdfforms/{yaml_path}')
    with yaml_path.open() as yaml_file:
        expect_fields = yaml.safe_load(yaml_file)
    assert actual.get('from file') == from_file
    assert actual.get('form key') == form_key
    actual_fields = actual.get('fields', ())
    # zip_longest pads the shorter side with None, so a count mismatch
    # makes the comparison below fail instead of passing silently.
    for got, wanted in itertools.zip_longest(actual_fields, expect_fields):
        for key, wanted_value in wanted.items():
            assert got[key] == wanted_value
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_extract_from_path(fdf_filename, form_key, fields_yaml):
    """Extraction through from_path() matches the YAML fixture."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    with extractmod.FormExtractor.from_path(source_path) as extractor:
        extracted = extractor.extract()
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_extract_from_file(fdf_filename, form_key, fields_yaml):
    """Extraction through from_file() matches the YAML fixture."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    with source_path.open('rb') as fdf_file:
        extracted = extractmod.FormExtractor.from_file(fdf_file).extract()
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_main(fdf_filename, form_key, fields_yaml):
    """main() exits 0, writes nothing to stderr, and prints matching YAML."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    out_stream = io.StringIO()
    err_stream = io.StringIO()
    exit_status = extractmod.main([str(source_path)], out_stream, err_stream)
    assert exit_status == 0
    assert not err_stream.getvalue()
    # Rewind so the YAML just written can be parsed back.
    out_stream.seek(0)
    parsed = yaml.safe_load(out_stream)
    compare_to_yaml(parsed, fields_yaml, fdf_filename, form_key)

View file

@ -0,0 +1,350 @@
"""test_pdfforms_fields.py - Unit tests for PDF forms manipulation"""
# Copyright © 2020 Brett Smith
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
#
# Full copyright and licensing details can be found at toplevel file
# LICENSE.txt in the repository.
import codecs
import itertools
import pytest
from pdfminer.psparser import PSLiteral
from conservancy_beancount.pdfforms import fields as fieldsmod
def field_source(
        name=None,
        value=None,
        field_type=None,
        flags=None,
        parent=None,
        kids=None,
        *,
        literal=None,
):
    """Build a dict shaped like a PDF field dictionary for tests.

    Only arguments that are not None produce an entry. A ``str`` name is
    ASCII-encoded to bytes. The value is wrapped in ``PSLiteral`` when
    ``literal`` is true; by default that happens for any field type other
    than ``Tx``. ``kids`` may be any iterable and is stored as a list.
    """
    source = {}
    if name is not None:
        source['T'] = name.encode('ascii') if isinstance(name, str) else name
    if value is not None:
        if literal is None:
            literal = field_type and field_type != 'Tx'
        source['V'] = PSLiteral(value) if literal else value
    if field_type is not None:
        source['FT'] = PSLiteral(field_type)
    for key, setting in (('Ff', flags), ('Parent', parent)):
        if setting is not None:
            source[key] = setting
    if kids is not None:
        source['Kids'] = list(kids)
    return source
def appearance_states(*names):
    """Return a dict with a distinct placeholder object per non-None name."""
    states = {}
    for state_name in names:
        if state_name is None:
            continue
        states[state_name] = object()
    return states
def test_empty_field():
    """A field with an empty source reports defaults and has no type."""
    field = fieldsmod.FormField(field_source())
    assert not field.name()
    assert field.value() is None
    assert field.parent() is None
    assert not list(field.kids())
    assert field.flags() == 0
    assert field.is_terminal()
    with pytest.raises(ValueError):
        field.field_type()
def test_text_field_base():
    """The base class reports a text field's value as raw bytes."""
    raw_text = b'string of text'
    field = fieldsmod.FormField(field_source(b's', raw_text, 'Tx'))
    assert field.field_type() is fieldsmod.FieldType.TEXT
    assert field.name() == 's'
    assert field.value() == raw_text
@pytest.mark.parametrize('value', ['Off', 'Yes', 'On'])
def test_checkbox_field_base(value):
    """The base class reports a button field's value as a raw literal."""
    field = fieldsmod.FormField(field_source(b'cb', value, 'Btn', literal=True))
    assert field.field_type() is fieldsmod.FieldType.BUTTON
    assert field.name() == 'cb'
    stored = field.value()
    assert stored.name == value
@pytest.mark.parametrize('flags', range(4))
def test_readonly_flag(flags):
    """is_readonly() follows the lowest bit of the field flags."""
    field = fieldsmod.FormField(field_source(flags=flags))
    assert field.flags() == flags
    # Bit 0 is the read-only flag, so odd values are read-only.
    assert field.is_readonly() == flags % 2
@pytest.mark.parametrize('kid_count', range(3))
def test_kids(kid_count):
    """kids() yields one field per Kids entry, in order."""
    kid_sources = [
        field_source(f'kid{n}', field_type='Ch')
        for n in range(kid_count)
    ]
    field = fieldsmod.FormField(field_source(kids=iter(kid_sources)))
    found = list(field.kids())
    assert len(found) == len(kid_sources)
    assert field.is_terminal() == (not kid_sources)
    for kid_field, kid_source in zip(found, kid_sources):
        assert kid_field.name() == kid_source['T'].decode('ascii')
def test_kids_by_type():
    """kids() returns each kid as the class matching its field type."""
    kid_sources = [field_source(field_type='Tx'), field_source(field_type='Btn')]
    top_source = field_source('topform', kids=iter(kid_sources))
    kid_fields = fieldsmod.FormField.by_type(top_source).kids()
    assert isinstance(next(kid_fields), fieldsmod.TextField)
    assert isinstance(next(kid_fields), fieldsmod.CheckboxField)
    # No further kids after the two defined above.
    assert next(kid_fields, None) is None
def test_inheritance():
    """A kid inherits type, value and flags from its parent, not its name."""
    parent_source = field_source(b'parent', 'parent value', 'Tx', 17)
    kid_source = field_source('kid', parent=parent_source)
    parent_source['Kids'] = [kid_source]
    kid = fieldsmod.FormField(kid_source)
    parent = kid.parent()
    assert parent is not None
    assert parent.name() == 'parent'
    assert not parent.is_terminal()
    assert kid.is_terminal()
    assert kid.name() == 'kid'
    assert kid.field_type() is fieldsmod.FieldType.TEXT
    assert kid.value() == 'parent value'
    assert kid.flags() == 17
    assert not list(kid.kids())
@pytest.mark.parametrize('field_type,value', [
    ('Tx', b'new value'),
    ('Btn', PSLiteral('Yes')),
])
def test_set_value(field_type, value):
    """The base class stores and returns raw values unchanged."""
    field = fieldsmod.FormField(field_source(field_type=field_type))
    assert field.value() is None
    field.set_value(value)
    assert field.value() == value
@pytest.mark.parametrize('field_type,expected', [
    ('Tx', fieldsmod.TextField),
    ('Btn', fieldsmod.CheckboxField),
])
def test_by_type(field_type, expected):
    """by_type() picks the dedicated class for supported field types."""
    typed_source = field_source(field_type=field_type)
    assert isinstance(fieldsmod.FormField.by_type(typed_source), expected)
def test_container_by_type():
    """by_type() returns a plain FormField for an untyped container."""
    kids = [field_source(field_type='Tx'), field_source(field_type='Btn')]
    source = field_source('topform', kids=iter(kids))
    field = fieldsmod.FormField.by_type(source)
    # An isinstance() check would accept any subclass and so could never
    # fail; require the exact base class, as the unsupported-type tests do.
    assert type(field) is fieldsmod.FormField
@pytest.mark.parametrize('flag', [
    # If you add dedicated classes for these types of buttons, you can remove
    # their test cases.
    fieldsmod.FieldFlags.Radio,
    fieldsmod.FieldFlags.Pushbutton,
])
def test_unsupported_button_by_type(flag):
    """Buttons without a dedicated class stay plain FormField objects."""
    button_source = field_source(field_type='Btn', flags=flag)
    made = fieldsmod.FormField.by_type(button_source)
    assert type(made) is fieldsmod.FormField
@pytest.mark.parametrize('field_type', [
    # If you add dedicated classes for these types of fields, you can remove
    # their test cases.
    'Ch',
    'Sig',
])
def test_unsupported_field_by_type(field_type):
    """Field types without a dedicated class stay plain FormField objects."""
    typed_source = field_source(field_type=field_type)
    made = fieldsmod.FormField.by_type(typed_source)
    assert type(made) is fieldsmod.FormField
@pytest.mark.parametrize('value', [None, 'Off', 'Yes'])
def test_checkbox_value(value):
    """CheckboxField.value() maps Yes/Off to True/False and unset to None."""
    field = fieldsmod.CheckboxField(field_source('cb', value, 'Btn', literal=True))
    # None stays None; otherwise only 'Yes' counts as checked.
    expected = value and value == 'Yes'
    assert field.value() == expected
@pytest.mark.parametrize('value,expected', [
    (None, None),
    (False, 'Off'),
    (True, 'Yes'),
])
def test_checkbox_set_value(value, expected):
    """Setting a boolean stores the default on/off state literal."""
    field = fieldsmod.CheckboxField(field_source('cb', field_type='Btn'))
    field.set_value(value)
    # Read through the base class to see the raw stored literal.
    stored = fieldsmod.FormField.value(field)
    if expected is None:
        assert stored is None
    else:
        assert stored.name == expected
@pytest.mark.parametrize('on_key,off_key', itertools.product(
    ['1', '2', 'On', 'Yes'],
    ['Off', None],
))
def test_checkbox_options(on_key, off_key):
    """options() reports the custom on state, with 'Off' as the off state."""
    checkbox_source = field_source('cb', field_type='Btn')
    checkbox_source['AP'] = {'N': appearance_states(on_key, off_key)}
    options = fieldsmod.CheckboxField(checkbox_source).options()
    assert options == [on_key, 'Off']
def test_checkbox_options_yes_no():
    """A Yes/No pair is accepted, with the non-'Yes' state treated as off."""
    # I'm not sure this is actually allowed under the spec, but…
    state_names = ['Yes', 'No']
    checkbox_source = field_source('cb', field_type='Btn')
    checkbox_source['AP'] = {'N': appearance_states(*state_names)}
    options = fieldsmod.CheckboxField(checkbox_source).options()
    assert options == state_names
@pytest.mark.parametrize('on_key,off_key,set_value', itertools.product(
    ['1', '2', 'On', 'Yes'],
    ['Off', None],
    [True, False, None],
))
def test_checkbox_set_custom_value(on_key, off_key, set_value):
    """set_value() stores the checkbox's own on-state name when checked."""
    checkbox_source = field_source('cb', field_type='Btn')
    checkbox_source['AP'] = {'N': appearance_states(on_key, off_key)}
    field = fieldsmod.CheckboxField(checkbox_source)
    field.set_value(set_value)
    # Read through the base class to see the raw stored literal.
    stored = fieldsmod.FormField.value(field)
    if set_value is None:
        assert stored is None
    elif set_value:
        assert stored.name == (on_key or 'Yes')
    else:
        assert stored.name == 'Off'
@pytest.mark.parametrize('encoding,prefix', [
    ('ascii', b''),
    ('utf-16be', codecs.BOM_UTF16_BE),
])
def test_text_value(encoding, prefix):
    """TextField.value() decodes both ASCII and BOM-prefixed UTF-16BE."""
    text = f'{encoding} encoding test'
    raw_value = prefix + text.encode(encoding)
    field = fieldsmod.TextField(field_source('t', raw_value, 'Tx'))
    assert field.value() == text
def test_text_value_none():
    """An unset text field has the value None."""
    field = fieldsmod.TextField(field_source(field_type='Tx'))
    assert field.value() is None
@pytest.mark.parametrize('text,bprefix', [
    ('ASCII test', b''),
    ('UTF—16 test', codecs.BOM_UTF16_BE),
])
def test_text_set_value(text, bprefix):
    """set_value() round-trips text and stores the expected encoding."""
    field = fieldsmod.TextField(field_source(field_type='Tx'))
    field.set_value(text)
    assert field.value() == text
    # A BOM prefix means the text could not be stored as ASCII.
    encoding = 'utf-16be' if bprefix else 'ascii'
    stored = fieldsmod.FormField.value(field)
    assert stored == bprefix + text.encode(encoding)
def test_text_set_value_none():
    """Setting None on a text field stores a raw None value."""
    field = fieldsmod.TextField(field_source('t', b'set None test', 'Tx'))
    field.set_value(None)
    stored = fieldsmod.FormField.value(field)
    assert stored is None
def test_empty_as_filled_fdf():
    """An empty field exports as an empty mapping."""
    field = fieldsmod.FormField(field_source())
    exported = field.as_filled_fdf()
    assert exported == {}
@pytest.mark.parametrize('field_type,field_class,set_value', [
    ('Btn', fieldsmod.CheckboxField, True),
    ('Btn', fieldsmod.CheckboxField, False),
    ('Ch', fieldsmod.FormField, None),
    ('Tx', fieldsmod.TextField, 'export test'),
    ('Tx', fieldsmod.TextField, 'UTF—16 export'),
])
def test_as_filled_fdf_after_set_value(field_type, field_class, set_value):
    """A set value is exported under V; a None value is left out."""
    # The field type doubles as the field name here.
    field = field_class(field_source(field_type, field_type=field_type))
    field.set_value(set_value)
    exported = field.as_filled_fdf()
    assert exported['T'] == field_type
    if set_value is None:
        assert 'V' not in exported
        expected_size = 1
    else:
        expected_size = 2
        if field_class is fieldsmod.CheckboxField:
            assert exported['V'].name == ('Yes' if set_value else 'Off')
        else:
            assert exported['V'] == set_value
    assert len(exported) == expected_size
@pytest.mark.parametrize('field_type,expected', [
    ('Btn', None),
    ('Tx', ''),
])
def test_as_filled_fdf_default_value(field_type, expected):
    """Unset fields export their class's default fill value, if any."""
    field = fieldsmod.FormField.by_type(field_source(field_type=field_type))
    exported = field.as_filled_fdf()
    assert exported.get('V') == expected
def test_as_filled_fdf_recursion():
    """as_filled_fdf() nests kids under Kids at every level."""
    button_sources = [
        field_source(f'bt{n}', field_type='Btn')
        for n in range(1, 3)
    ]
    pair_source = field_source('Buttons', kids=iter(button_sources))
    text_source = field_source('tx', field_type='Tx')
    top_source = field_source('topform', kids=[text_source, pair_source])
    top = fieldsmod.FormField(top_source).as_filled_fdf()
    assert top['T'] == 'topform'
    assert 'V' not in top
    top_kids = iter(top['Kids'])
    assert next(top_kids)['T'] == 'tx'
    pair = next(top_kids)
    assert pair['T'] == 'Buttons'
    assert 'V' not in pair
    pair_kids = iter(pair['Kids'])
    assert next(pair_kids)['T'] == 'bt1'
    assert next(pair_kids)['T'] == 'bt2'
    assert next(pair_kids, None) is None
@pytest.mark.parametrize('name,value,field_type', [
    (None, None, None),
    ('mt', 'mapping text', 'Tx'),
    ('mb', 'Yes', 'Btn'),
])
def test_simple_as_mapping(name, value, field_type):
    """A field without kids maps to exactly one (name, field) pair."""
    field = fieldsmod.FormField(field_source(name, value, field_type))
    pairs = field.as_mapping()
    mapped_name, mapped_field = next(pairs)
    assert mapped_name == (name or '')
    assert mapped_field is field
    assert next(pairs, None) is None
def test_recursive_as_mapping():
    """as_mapping() walks the tree depth-first, yielding dotted full names."""
    button_sources = [
        field_source(f'btn{n}', field_type='Btn')
        for n in range(1, 3)
    ]
    buttons_source = field_source('buttons', kids=iter(button_sources))
    text_sources = [
        field_source(f'tx{n}', field_type='Tx')
        for n in range(1, 3)
    ]
    texts_source = field_source('texts', kids=iter(text_sources))
    root_source = field_source('root', kids=[texts_source, buttons_source])
    pairs = fieldsmod.FormField(root_source).as_mapping()
    expected_keys = [
        'root',
        'root.texts',
        'root.texts.tx1',
        'root.texts.tx2',
        'root.buttons',
        'root.buttons.btn1',
        'root.buttons.btn2',
    ]
    for expected_key in expected_keys:
        mapped_key, mapped_field = next(pairs)
        assert mapped_key == expected_key
        # The field's own name is the last dotted component.
        expected_name = expected_key.rpartition('.')[2]
        assert mapped_field.name() == expected_name
    assert next(pairs, None) is None