tools: Add extract-odf-links.

2020-08-10 10:02:40 -04:00 · 2020-08-10 10:02:40 -04:00 · ce067963dc
commit ce067963dc
parent 8597a526d7
4 changed files with 188 additions and 1 deletions
--- a/conservancy_beancount/tools/extract_odf_links.py
+++ b/conservancy_beancount/tools/extract_odf_links.py
@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""extract_odf_links.py - Tool to extract links from ODF documents
+
+Given one or more ODF documents, this tool finds links that refer to the local
+filesystem, and writes their full paths to stdout.
+"""
+# Copyright © 2020 Brett Smith
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+import logging
+import os
+import sys
+import urllib.parse
+
+from pathlib import Path
+from zipfile import BadZipFile
+
+import odf.opendocument  # type:ignore[import]
+import odf.text  # type:ignore[import]
+
+from typing import (
+    Iterator,
+    Optional,
+    Sequence,
+    Set,
+    TextIO,
+)
+
+from .. import cliutil
+
+# Program name: passed to argparse as prog= and to cliutil.make_entry_point().
+PROGNAME = 'extract-odf-links'
+# Module-level logger; main() hands it to cliutil.set_loglevel().
+logger = logging.getLogger('conservancy_beancount.tools.extract_odf_links')
+
+def parse_delimiter(arg: str) -> str:
+    """Decode Python backslash escapes in a delimiter given on the command line.
+
+    ``arg`` is wrapped in double quotes (with any literal ``"`` escaped) and
+    parsed as a Python string literal, so ``\\n``, ``\\t``, ``\\0``, ``\\u...``
+    and friends are all understood.
+
+    Returns the decoded string.
+
+    Raises ValueError when ``arg`` does not form a single valid string
+    literal (for example a trailing lone backslash). argparse turns that
+    into a usage error when this function is used as an argument ``type``.
+    """
+    # Imported locally so this function carries its own dependency.
+    import ast
+    try:
+        # literal_eval only accepts literals. Unlike eval(), it cannot be
+        # tricked into running code by an argument that closes the quote
+        # early, e.g. `\" + some_call() #`.
+        retval = ast.literal_eval('"{}"'.format(arg.replace('"', r'\"')))
+    except (SyntaxError, ValueError, TypeError):
+        retval = None
+    if isinstance(retval, str):
+        return retval
+    raise ValueError(f"not a valid string: {arg!r}")
+
+def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
+    """Parse command-line arguments for extract-odf-links.
+
+    ``arglist`` is handed straight to ``ArgumentParser.parse_args``; when it
+    is None, argparse falls back to the process's own command line.
+
+    The returned namespace carries ``delimiter`` (a string, already decoded
+    by parse_delimiter) and ``odf_paths`` (a non-empty list of Path), plus
+    whatever the cliutil version/loglevel helpers register.
+    """
+    delimiter_help = """String to output between links. Accepts all backslash escapes
+supported in Python like \\n, \\t, \\0, \\u, etc. Default '\\n'.
+"""
+    zero_help = """Shortcut for --delimiter=\\0
+"""
+    paths_help = """ODF file(s) to extract links from. Note that %(prog)s cannot
+read from stdin because it needs to know document paths to resolve links.
+"""
+
+    arg_parser = argparse.ArgumentParser(prog=PROGNAME)
+    cliutil.add_version_argument(arg_parser)
+    cliutil.add_loglevel_argument(arg_parser)
+    # The default is a string, so argparse runs it through parse_delimiter
+    # too: the escaped '\\n' becomes a real newline.
+    arg_parser.add_argument(
+        '--delimiter', '-d',
+        type=parse_delimiter,
+        default='\\n',
+        metavar='STR',
+        help=delimiter_help,
+    )
+    # Shares dest with --delimiter, so whichever flag comes last wins.
+    arg_parser.add_argument(
+        '--zero', '--null', '-z', '-0',
+        dest='delimiter',
+        action='store_const',
+        const='\0',
+        help=zero_help,
+    )
+    arg_parser.add_argument(
+        'odf_paths',
+        nargs=argparse.ONE_OR_MORE,
+        type=Path,
+        metavar='ODF_PATH',
+        help=paths_help,
+    )
+    return arg_parser.parse_args(arglist)
+
+def extract_links(odf_path: Path) -> Iterator[Path]:
+    """Yield the local filesystem path of each link in one ODF document.
+
+    Every ``text:a`` element in the document is inspected. Links with a
+    scheme other than ``file``, and links with an empty path component, are
+    skipped. Absolute paths are yielded as written (after URL-unquoting);
+    relative paths are joined onto ``odf_path`` and resolved first.
+
+    This is a generator, so the document is only opened once iteration
+    starts; errors from opening or loading it surface at that point.
+    """
+    doc_dir = odf_path.parent.resolve()
+    with odf_path.open('rb') as odf_file:
+        document = odf.opendocument.load(odf_file)
+    for anchor in document.getElementsByType(odf.text.A):
+        href = anchor.getAttribute('href') or ''
+        url = urllib.parse.urlparse(href)
+        # Keep only scheme-less or file: links that actually name a path.
+        if not url.path or url.scheme not in ('', 'file'):
+            continue
+        link_path = Path(urllib.parse.unquote(url.path))
+        if link_path.is_absolute():
+            yield link_path
+            continue
+        # NOTE(review): joining onto the document path itself (not its
+        # parent) looks deliberate - presumably relative hrefs are stored
+        # relative to a file inside the ODF package, so a leading "../"
+        # steps out to the document's directory. Confirm against the spec.
+        resolved = (odf_path / link_path).resolve()
+        try:
+            resolved.relative_to(doc_dir)
+        except ValueError:
+            logger.warning(f"link {resolved} is neither absolute nor relative to {odf_path}")
+        yield resolved
+
+def main(arglist: Optional[Sequence[str]]=None,
+         stdout: TextIO=sys.stdout,
+         stderr: TextIO=sys.stderr,
+) -> int:
+    """Run the tool: print every distinct link found in the given documents.
+
+    ``arglist`` is the command line to parse (None lets argparse use the
+    process's own). Links are written to ``stdout``, each followed by the
+    chosen delimiter. ``stderr`` is accepted but not written to by this
+    function; diagnostics go through the module logger.
+
+    Returns 0 on success, or ``os.EX_DATAERR`` if any document could not be
+    read or parsed. Links from the remaining documents are still printed.
+    """
+    args = parse_arguments(arglist)
+    cliutil.set_loglevel(logger, args.loglevel)
+
+    returncode = 0
+    # A set, so a link shared by several documents is printed only once.
+    links: Set[Path] = set()
+    for odf_path in args.odf_paths:
+        try:
+            links.update(extract_links(odf_path))
+        except IOError as error:
+            # strerror is None when the error carries no errno; fall back to
+            # the exception itself so the log never just says "None".
+            logger.error("error reading %s: %s", odf_path, error.strerror or error)
+            returncode = os.EX_DATAERR
+        except BadZipFile as error:
+            logger.error("error parsing %s: %s", odf_path, error)
+            returncode = os.EX_DATAERR
+
+    # Sort so output order is stable across runs; plain set iteration order
+    # changes with string hash randomisation.
+    for link in sorted(links):
+        if not link.exists():
+            logger.warning("path %s not found", link)
+        print(link, end=args.delimiter, file=stdout)
+    return returncode
+
+# Callable referenced by the 'extract-odf-links' console_scripts entry.
+entry_point = cliutil.make_entry_point(__name__, PROGNAME)
+
+if __name__ == '__main__':
+    # sys.exit, not the bare exit() helper: the latter is injected by the
+    # site module and is missing when Python runs with -S.
+    sys.exit(entry_point())
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ from setuptools import setup
 setup(
    name='conservancy_beancount',
    description="Plugin, library, and reports for reading Conservancy's books",
-    version='1.6.4',
+    version='1.7.0',
    author='Software Freedom Conservancy',
    author_email='info@sfconservancy.org',
    license='GNU AGPLv3+',
@ -37,6 +37,7 @@ setup(
    entry_points={
        'console_scripts': [
            'accrual-report = conservancy_beancount.reports.accrual:entry_point',
+            'extract-odf-links = conservancy_beancount.tools.extract_odf_links:entry_point',
            'fund-report = conservancy_beancount.reports.fund:entry_point',
            'ledger-report = conservancy_beancount.reports.ledger:entry_point',
            'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
--- a/tests/repository/LinksReport.ods
+++ b/tests/repository/LinksReport.ods
--- a/tests/test_extract_odf_links.py
+++ b/tests/test_extract_odf_links.py
@ -0,0 +1,55 @@
+"""test_extract_odf_links.py - Unit tests for ODF link extraction"""
+# Copyright © 2020  Brett Smith
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import io
+
+import pytest
+
+from . import testutil
+
+from conservancy_beancount.tools import extract_odf_links
+
+# ODF spreadsheet fixture that the tests below extract links from.
+SRC_PATH = testutil.test_path('repository/LinksReport.ods')
+
+# The exact set of paths (as strings) the tool should print for SRC_PATH.
+# NOTE(review): presumably one absolute link reported verbatim plus two links
+# stored relative to the document; "Bad Link.txt" is the one the test expects
+# a "not found" warning for. Confirm against the fixture.
+EXPECTED_FILE_LINKS = {
+    '/repository/Projects/project-data.yml',
+    str(testutil.test_path('repository/Projects/project-data.yml')),
+    str(testutil.test_path('repository/Projects/Bad Link.txt')),
+}
+
+@pytest.mark.parametrize('arglist,sep', [
+    (['-0'], '\0'),
+    (['-d', '\\v'], '\v'),
+    ([str(SRC_PATH)], '\n'),  # Test that links aren't duplicated
+])
+def test_extract_file_links(arglist, sep, caplog):
+    """Run the tool on the fixture and check its output and warnings.
+
+    Each case supplies leading arguments and the separator expected between
+    the printed links. The fixture path is always added as the final
+    argument, so the third case passes the same document twice.
+    """
+    # Build a fresh list instead of appending in place: the list object
+    # belongs to the parametrize decorator and would otherwise grow every
+    # time the same case is executed again.
+    arglist = [*arglist, str(SRC_PATH)]
+    stdout = io.StringIO()
+    stderr = io.StringIO()
+    exitcode = extract_odf_links.main(arglist, stdout, stderr)
+    assert exitcode == 0
+    assert not stderr.getvalue()
+    actual = stdout.getvalue().split(sep)
+    # Output ends with a trailing separator, which leaves one empty field.
+    if actual and not actual[-1]:
+        actual.pop()
+    # Compare length as well as set contents so duplicates are caught.
+    assert len(actual) == len(EXPECTED_FILE_LINKS)
+    assert set(actual) == EXPECTED_FILE_LINKS
+    assert caplog.records
+    assert any(
+        log.levelname == 'WARNING'
+        and log.message.endswith('/Bad Link.txt not found')
+        for log in caplog.records
+    )