extract_odf_links: Add --relative-to option.

This is too convenient to leave out for our usual case, where we expect all links to be relative to a specific directory (the repository).
2020-08-30 10:03:57 -04:00 · 2020-08-30 10:03:57 -04:00 · 2eba5a5546
commit 2eba5a5546
parent cf2d825a08
2 changed files with 54 additions and 15 deletions
--- a/conservancy_beancount/tools/extract_odf_links.py
+++ b/conservancy_beancount/tools/extract_odf_links.py
@ -72,29 +72,39 @@ supported in Python like \\n, \\t, \\0, \\u, etc. Default `%(default)s`.
        dest='delimiter',
        const='\0',
        help="""Shortcut for --delimiter=\\0
+""")
+    parser.add_argument(
+        '--relative-to', '-r',
+        metavar='PATH',
+        type=Path,
+        help="""Try to resolve all links relative to this path, rather than each
+spreadsheet's path
 """)
    parser.add_argument(
        'odf_paths',
        metavar='ODF_PATH',
        type=Path,
        nargs=argparse.ONE_OR_MORE,
-        help="""ODF file(s) to extract links from. Note that %(prog)s cannot
-read from stdin because it needs to know document paths to resolve links.
+        help="""ODF file(s) to extract links from
 """)
-    return parser.parse_args(arglist)
+    args = parser.parse_args(arglist)
+    if args.relative_to is None:
+        if any(path == cliutil.STDSTREAM_PATH for path in args.odf_paths):
+            parser.error("--relative-to is required to read from stdin")
+    elif args.relative_to.is_dir() or not args.relative_to.exists():
+        args.relative_to /= 'PathStub.ods'
+    return args

-def extract_links(odf_path: Path) -> Iterator[Path]:
-    with odf_path.open('rb') as odf_file:
-        odf_doc = odf.opendocument.load(odf_file)
+def extract_links(odf_doc: odf.opendocument.OpenDocument, rel_path: Path) -> Iterator[Path]:
    for a_elem in odf_doc.getElementsByType(odf.text.A):
        parts = urllib.parse.urlparse(a_elem.getAttribute('href') or '')
        if (parts.scheme and parts.scheme != 'file') or not parts.path:
            continue
        path = Path(urllib.parse.unquote(parts.path))
        if not path.is_absolute():
-            path = (odf_path / path).resolve()
+            path = (rel_path / path).resolve()
            try:
-                path.relative_to(odf_path)
+                path.relative_to(rel_path)
            except ValueError:
                pass
            else:
@ -113,7 +123,9 @@ def main(arglist: Optional[Sequence[str]]=None,
    links: Set[Path] = set()
    for odf_path in args.odf_paths:
        try:
-            links.update(extract_links(odf_path))
+            with cliutil.bytes_output(odf_path, sys.stdin, 'r') as odf_file:
+                odf_doc = odf.opendocument.load(odf_file)
+            links.update(extract_links(odf_doc, args.relative_to or odf_path))
        except IOError as error:
            logger.error("error reading %s: %s", odf_path, error.strerror)
            returncode = os.EX_DATAERR
--- a/tests/test_extract_odf_links.py
+++ b/tests/test_extract_odf_links.py
@ -15,6 +15,7 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

 import io
+import sys

 import pytest

@ -38,6 +39,14 @@ def expected_links(rel_path):
        for path in INCLUDED_FILE_LINKS
    )

+def check_output(stdout, sep, rel_path):
+    actual = stdout.getvalue().split(sep)
+    if actual and not actual[-1]:
+        actual.pop()
+    expected = expected_links(rel_path)
+    assert len(actual) == len(expected)
+    assert set(actual) == expected
+
@pytest.mark.parametrize('arglist,sep', [
    (['-0'], '\0'),
    (['-d', '\\v'], '\v'),
@ -50,15 +59,33 @@ def test_extract_file_links(arglist, sep, caplog):
    exitcode = extract_odf_links.main(arglist, stdout, stderr)
    assert exitcode == 0
    assert not stderr.getvalue()
-    actual = stdout.getvalue().split(sep)
-    if actual and not actual[-1]:
-        actual.pop()
-    expected = expected_links(SRC_PATH.parent)
-    assert len(actual) == len(expected)
-    assert set(actual) == expected
+    check_output(stdout, sep, SRC_PATH.parent)
    assert caplog.records
    assert any(
        log.levelname == 'WARNING'
        and log.message.endswith('/Bad Link.txt not found')
        for log in caplog.records
    )
+
+@pytest.mark.parametrize('rel_path', [
+    Path('/run'),
+    Path('/tmp'),
+])
+def test_extract_relative_to(rel_path):
+    arglist = ['--relative', str(rel_path), '-0', '-']
+    stdout = io.StringIO()
+    stderr = io.StringIO()
+    orig_stdin = sys.stdin
+    try:
+        sys.stdin = SRC_PATH.open('rb')
+        exitcode = extract_odf_links.main(arglist, stdout, stderr)
+    finally:
+        sys.stdin = orig_stdin
+    assert exitcode == 0
+    assert not stderr.getvalue()
+    check_output(stdout, '\0', rel_path)
+
+def test_reading_stdin_requires_relative_to():
+    with pytest.raises(SystemExit) as exc_check:
+        extract_odf_links.main(['-'])
+    assert exc_check.value.args[0] == 2