tools: Add extract-odf-links.
This commit is contained in:
parent
8597a526d7
commit
ce067963dc
4 changed files with 188 additions and 1 deletions
131
conservancy_beancount/tools/extract_odf_links.py
Normal file
131
conservancy_beancount/tools/extract_odf_links.py
Normal file
|
@ -0,0 +1,131 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""extract_odf_links.py - Tool to extract links from ODF documents
|
||||||
|
|
||||||
|
Given one or more ODF documents, this tool finds links that refer to the local
|
||||||
|
filesystem, and writes their full paths to stdout.
|
||||||
|
"""
|
||||||
|
# Copyright © 2020 Brett Smith
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from zipfile import BadZipFile
|
||||||
|
|
||||||
|
import odf.opendocument # type:ignore[import]
|
||||||
|
import odf.text # type:ignore[import]
|
||||||
|
|
||||||
|
from typing import (
|
||||||
|
Iterator,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
Set,
|
||||||
|
TextIO,
|
||||||
|
)
|
||||||
|
|
||||||
|
from .. import cliutil
|
||||||
|
|
||||||
|
# Program name: given to argparse as ``prog`` and to cliutil.make_entry_point.
PROGNAME = 'extract-odf-links'
# Module-level logger used for the warnings and errors this tool reports.
logger = logging.getLogger('conservancy_beancount.tools.extract_odf_links')
|
||||||
|
|
||||||
|
def parse_delimiter(arg: str) -> str:
    """Decode Python backslash escapes in a command-line delimiter argument.

    The argument is interpreted as the body of a double-quoted Python string
    literal, so escapes like ``\\n``, ``\\t``, ``\\0`` and ``\\u2022`` are
    translated to the characters they name. Double quotes may be given bare
    or backslash-escaped; both produce a literal ``"``.

    Raises ValueError if the argument is not a valid string body (for
    example, it ends with a lone backslash or contains a raw newline).
    """
    import ast
    import re

    # Escape only *bare* double quotes. Backslash pairs are consumed as a
    # unit, so an already-escaped quote is left alone instead of being turned
    # into an escaped backslash followed by a quote that ends the literal.
    body = re.sub(
        r'(\\.)|"',
        lambda match: match.group(1) or r'\"',
        arg,
        flags=re.DOTALL,
    )
    try:
        # literal_eval only accepts literals, so the argument can never run
        # arbitrary code the way eval() could.
        retval = ast.literal_eval(f'"{body}"')
    except (SyntaxError, ValueError):
        retval = None
    if isinstance(retval, str):
        return retval
    raise ValueError(f"not a valid string: {arg!r}")
|
||||||
|
|
||||||
|
def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
    """Build the extract-odf-links argument parser and parse ``arglist``.

    ``arglist`` is handed straight to ``ArgumentParser.parse_args``; when it
    is None, argparse reads the process's command-line arguments instead.
    The returned namespace carries ``delimiter`` and ``odf_paths`` plus
    whatever the shared cliutil version/loglevel helpers register.
    """
    delimiter_help = """String to output between links. Accepts all backslash escapes
supported in Python like \\n, \\t, \\0, \\u, etc. Default '\\n'.
"""
    zero_help = """Shortcut for --delimiter=\\0
"""
    paths_help = """ODF file(s) to extract links from. Note that %(prog)s cannot
read from stdin because it needs to know document paths to resolve links.
"""

    argparser = argparse.ArgumentParser(prog=PROGNAME)
    cliutil.add_version_argument(argparser)
    cliutil.add_loglevel_argument(argparser)
    # The default is written as an escape sequence: argparse runs string
    # defaults through ``type``, so parse_delimiter decodes it like user input.
    argparser.add_argument(
        '--delimiter', '-d',
        type=parse_delimiter,
        default='\\n',
        metavar='STR',
        help=delimiter_help,
    )
    # Stores an already-decoded NUL into the same destination as --delimiter.
    argparser.add_argument(
        '--zero', '--null', '-z', '-0',
        dest='delimiter',
        action='store_const',
        const='\0',
        help=zero_help,
    )
    argparser.add_argument(
        'odf_paths',
        nargs=argparse.ONE_OR_MORE,
        type=Path,
        metavar='ODF_PATH',
        help=paths_help,
    )
    return argparser.parse_args(arglist)
|
||||||
|
|
||||||
|
def extract_links(odf_path: Path) -> Iterator[Path]:
    """Yield the filesystem path named by each local link in an ODF document.

    Every ``odf.text.A`` element in the document is examined. Links with a
    scheme other than ``file``, and links with no path component, are
    skipped. Absolute paths are yielded unchanged. Relative paths are joined
    onto ``odf_path`` and resolved; a warning is logged when the result falls
    outside the directory containing the document, but the path is still
    yielded.
    """
    base_dir = odf_path.parent.resolve()
    with odf_path.open('rb') as source:
        document = odf.opendocument.load(source)
    for anchor in document.getElementsByType(odf.text.A):
        href = anchor.getAttribute('href') or ''
        url = urllib.parse.urlparse(href)
        # Only scheme-less and file: links with a path can name local files.
        if not url.path or url.scheme not in ('', 'file'):
            continue
        link_path = Path(urllib.parse.unquote(url.path))
        if link_path.is_absolute():
            yield link_path
            continue
        # NOTE(review): the link is joined onto the document path itself, not
        # its parent directory - presumably because relative links inside an
        # ODF package start with "../" to leave the package; confirm.
        link_path = (odf_path / link_path).resolve()
        try:
            link_path.relative_to(base_dir)
        except ValueError:
            logger.warning(f"link {link_path} is neither absolute nor relative to {odf_path}")
        yield link_path
|
||||||
|
|
||||||
|
def main(arglist: Optional[Sequence[str]]=None,
         stdout: TextIO=sys.stdout,
         stderr: TextIO=sys.stderr,
) -> int:
    """Print every local link found in the ODF documents named in ``arglist``.

    Links from all documents are de-duplicated and written to ``stdout`` in
    sorted order, each followed by the configured delimiter. A link whose
    path does not exist is still printed, after logging a warning.

    ``stderr`` is accepted but not written to by this function; diagnostics
    go through the module logger.

    Returns 0 on success, or ``os.EX_DATAERR`` if any document could not be
    read or parsed. Links from the remaining documents are still printed.
    """
    args = parse_arguments(arglist)
    cliutil.set_loglevel(logger, args.loglevel)

    returncode = 0
    links: Set[Path] = set()
    for odf_path in args.odf_paths:
        try:
            links.update(extract_links(odf_path))
        except IOError as error:
            # strerror is None when the exception carries no OS error
            # details; fall back to the exception text rather than log "None".
            logger.error("error reading %s: %s", odf_path, error.strerror or error)
            returncode = os.EX_DATAERR
        except BadZipFile as error:
            logger.error("error parsing %s: %s", odf_path, error.args[0])
            returncode = os.EX_DATAERR

    # Set iteration order depends on per-process string hash randomization,
    # so sort to give the same output order on every run.
    for link in sorted(links):
        if not link.exists():
            logger.warning("path %s not found", link)
        print(link, end=args.delimiter, file=stdout)
    return returncode
|
||||||
|
|
||||||
|
# Callable referenced by the ``extract-odf-links`` console script in setup.py.
entry_point = cliutil.make_entry_point(__name__, PROGNAME)

if __name__ == '__main__':
    # Use sys.exit rather than the bare exit() helper: the latter is injected
    # by the site module for interactive use and is absent under ``python -S``.
    sys.exit(entry_point())
|
3
setup.py
3
setup.py
|
@ -5,7 +5,7 @@ from setuptools import setup
|
||||||
setup(
|
setup(
|
||||||
name='conservancy_beancount',
|
name='conservancy_beancount',
|
||||||
description="Plugin, library, and reports for reading Conservancy's books",
|
description="Plugin, library, and reports for reading Conservancy's books",
|
||||||
version='1.6.4',
|
version='1.7.0',
|
||||||
author='Software Freedom Conservancy',
|
author='Software Freedom Conservancy',
|
||||||
author_email='info@sfconservancy.org',
|
author_email='info@sfconservancy.org',
|
||||||
license='GNU AGPLv3+',
|
license='GNU AGPLv3+',
|
||||||
|
@ -37,6 +37,7 @@ setup(
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
'accrual-report = conservancy_beancount.reports.accrual:entry_point',
|
'accrual-report = conservancy_beancount.reports.accrual:entry_point',
|
||||||
|
'extract-odf-links = conservancy_beancount.tools.extract_odf_links:entry_point',
|
||||||
'fund-report = conservancy_beancount.reports.fund:entry_point',
|
'fund-report = conservancy_beancount.reports.fund:entry_point',
|
||||||
'ledger-report = conservancy_beancount.reports.ledger:entry_point',
|
'ledger-report = conservancy_beancount.reports.ledger:entry_point',
|
||||||
'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
|
'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
|
||||||
|
|
BIN
tests/repository/LinksReport.ods
Normal file
BIN
tests/repository/LinksReport.ods
Normal file
Binary file not shown.
55
tests/test_extract_odf_links.py
Normal file
55
tests/test_extract_odf_links.py
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
"""test_extract_odf_links.py - Unit tests for ODF link extraction"""
|
||||||
|
# Copyright © 2020 Brett Smith
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import io
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from . import testutil
|
||||||
|
|
||||||
|
from conservancy_beancount.tools import extract_odf_links
|
||||||
|
|
||||||
|
# ODF spreadsheet fixture that the tool is run against in this module's test.
SRC_PATH = testutil.test_path('repository/LinksReport.ods')

# The complete set of links main() is expected to print for SRC_PATH: one
# literal absolute path, plus two paths located under the test directory.
# The test also expects a "not found" warning for 'Bad Link.txt'.
EXPECTED_FILE_LINKS = {
    '/repository/Projects/project-data.yml',
    str(testutil.test_path('repository/Projects/project-data.yml')),
    str(testutil.test_path('repository/Projects/Bad Link.txt')),
}
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('arglist,sep', [
    (['-0'], '\0'),
    (['-d', '\\v'], '\v'),
    ([str(SRC_PATH)], '\n'),  # Test that links aren't duplicated
])
def test_extract_file_links(arglist, sep, caplog):
    """Check main() prints exactly the expected links, split by ``sep``.

    Each case supplies leading arguments and the delimiter the output should
    use; the fixture path is appended before calling main(). The run must
    exit 0, leave stderr empty, print each expected link exactly once, and
    log a warning for the link whose target does not exist.
    """
    arglist.append(str(SRC_PATH))
    out = io.StringIO()
    err = io.StringIO()
    assert extract_odf_links.main(arglist, out, err) == 0
    assert not err.getvalue()
    fields = out.getvalue().split(sep)
    # Every link is followed by the delimiter, so splitting leaves one empty
    # trailing field; discard it before comparing.
    if fields and fields[-1] == '':
        del fields[-1]
    assert len(fields) == len(EXPECTED_FILE_LINKS)
    assert set(fields) == EXPECTED_FILE_LINKS
    assert caplog.records
    missing_warnings = [
        record for record in caplog.records
        if record.levelname == 'WARNING'
        and record.message.endswith('/Bad Link.txt not found')
    ]
    assert missing_warnings
|
Loading…
Reference in a new issue