From ce067963dc6feaf40af7c41f69a4d64a0f5855b2 Mon Sep 17 00:00:00 2001 From: Brett Smith Date: Mon, 10 Aug 2020 10:02:40 -0400 Subject: [PATCH] tools: Add extract-odf-links. --- .../tools/extract_odf_links.py | 131 ++++++++++++++++++ setup.py | 3 +- tests/repository/LinksReport.ods | Bin 0 -> 9166 bytes tests/test_extract_odf_links.py | 55 ++++++++ 4 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 conservancy_beancount/tools/extract_odf_links.py create mode 100644 tests/repository/LinksReport.ods create mode 100644 tests/test_extract_odf_links.py diff --git a/conservancy_beancount/tools/extract_odf_links.py b/conservancy_beancount/tools/extract_odf_links.py new file mode 100644 index 0000000..c71ac77 --- /dev/null +++ b/conservancy_beancount/tools/extract_odf_links.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +"""extract_odf_links.py - Tool to extract links from ODF documents + +Given one or more ODF documents, this tool finds links that refer to the local +filesystem, and writes their full paths to stdout. +""" +# Copyright © 2020 Brett Smith +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+import logging
+import os
+import sys
+import urllib.parse
+
+from pathlib import Path
+from zipfile import BadZipFile
+
+import odf.opendocument  # type:ignore[import]
+import odf.text  # type:ignore[import]
+
+from typing import (
+    Iterator,
+    Optional,
+    Sequence,
+    Set,
+    TextIO,
+)
+
+from .. import cliutil
+
+PROGNAME = 'extract-odf-links'
+logger = logging.getLogger('conservancy_beancount.tools.extract_odf_links')
+
+def parse_delimiter(arg: str) -> str:
+    """Decode Python-style backslash escapes in a ``--delimiter`` argument.
+
+    ``arg`` is the raw command-line text, e.g. the two characters ``\\n``.
+    The return value is the string with escapes such as ``\\n``, ``\\t``,
+    ``\\0``, ``\\xNN``, ``\\uNNNN`` and ``\\N{NAME}`` replaced by the characters
+    they name.
+
+    Raises ValueError when ``arg`` contains a malformed escape (for example
+    a trailing backslash); argparse reports that as a usage error.
+
+    The text is decoded with the ``unicode_escape`` codec rather than passed
+    to ``eval()``, so nothing in the argument can be executed as code, and
+    quote characters need no special treatment.
+    """
+    try:
+        # ``unicode_escape`` decodes bytes, reading non-escape bytes as
+        # Latin-1. Characters outside Latin-1 are therefore written out as
+        # \uXXXX / \UXXXXXXXX escapes first, so they survive the round trip.
+        raw = arg.encode('latin-1', 'backslashreplace')
+        return raw.decode('unicode_escape')
+    except UnicodeDecodeError as error:
+        raise ValueError(f"not a valid string: {arg!r}") from error
+
+def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
+    """Parse command-line arguments for the tool.
+
+    ``arglist`` is handed to ``ArgumentParser.parse_args``; ``None`` makes
+    argparse read ``sys.argv``. The returned namespace carries ``delimiter``
+    (a str), ``odf_paths`` (one or more ``Path`` objects), plus whatever the
+    ``cliutil`` version/loglevel helpers add.
+    """
+    parser = argparse.ArgumentParser(prog=PROGNAME)
+    cliutil.add_version_argument(parser)
+    cliutil.add_loglevel_argument(parser)
+    parser.add_argument(
+        '--delimiter', '-d',
+        metavar='STR',
+        type=parse_delimiter,
+        # argparse runs string defaults through ``type``, so this escaped
+        # form becomes a real newline.
+        default='\\n',
+        help="""String to output between links. Accepts all backslash escapes
+supported in Python like \\n, \\t, \\0, \\u, etc. Default '\\n'.
+""")
+    parser.add_argument(
+        '--zero', '--null', '-z', '-0',
+        action='store_const',
+        dest='delimiter',
+        const='\0',
+        help="""Shortcut for --delimiter=\\0
+""")
+    parser.add_argument(
+        'odf_paths',
+        metavar='ODF_PATH',
+        type=Path,
+        nargs=argparse.ONE_OR_MORE,
+        help="""ODF file(s) to extract links from. Note that %(prog)s cannot
+read from stdin because it needs to know document paths to resolve links.
+""")
+    return parser.parse_args(arglist)
+
+def extract_links(odf_path: Path) -> Iterator[Path]:
+    """Yield a filesystem path for each local link in the document at ``odf_path``.
+
+    A link is considered local when its ``href`` has a non-empty path and
+    either no URL scheme or the ``file`` scheme. Percent-escapes in the path
+    are decoded. Absolute paths are yielded as-is; relative paths are
+    resolved first. The same path may be yielded more than once.
+
+    Errors from opening or loading the document propagate to the caller.
+    """
+    odf_root = odf_path.parent.resolve()
+    with odf_path.open('rb') as odf_file:
+        odf_doc = odf.opendocument.load(odf_file)
+    for a_elem in odf_doc.getElementsByType(odf.text.A):
+        parts = urllib.parse.urlparse(a_elem.getAttribute('href') or '')
+        if (parts.scheme and parts.scheme != 'file') or not parts.path:
+            continue
+        path = Path(urllib.parse.unquote(parts.path))
+        if not path.is_absolute():
+            # NOTE(review): the link is joined to the document path itself,
+            # not to its parent directory. Presumably relative hrefs are
+            # written relative to a file inside the ODF package, so a
+            # sibling file appears as "../name" - confirm against the ODF
+            # specification before changing this.
+            path = (odf_path / path).resolve()
+            try:
+                path.relative_to(odf_root)
+            except ValueError:
+                logger.warning(
+                    "link %s is neither absolute nor relative to %s",
+                    path, odf_path,
+                )
+        yield path
+
+def main(arglist: Optional[Sequence[str]]=None,
+         stdout: TextIO=sys.stdout,
+         stderr: TextIO=sys.stderr,
+) -> int:
+    """Run the tool and return its exit status.
+
+    Each distinct link found in the given documents is written once to
+    ``stdout``, followed by the configured delimiter, in sorted order so the
+    output is reproducible. A link whose target does not exist is still
+    written, with a warning logged. ``stderr`` is not used by this function.
+
+    Returns 0 on success, or the EX_DATAERR status when any document could
+    not be read or parsed (links from the other documents are still written).
+    """
+    args = parse_arguments(arglist)
+    cliutil.set_loglevel(logger, args.loglevel)
+
+    # os.EX_DATAERR is only defined on Unix; 65 is its sysexits.h value.
+    data_error = getattr(os, 'EX_DATAERR', 65)
+    returncode = 0
+    links: Set[Path] = set()
+    for odf_path in args.odf_paths:
+        try:
+            links.update(extract_links(odf_path))
+        except OSError as error:
+            # strerror is None when the error carries no errno; fall back to
+            # the exception text rather than logging "None".
+            logger.error("error reading %s: %s", odf_path, error.strerror or error)
+            returncode = data_error
+        except BadZipFile as error:
+            # Format the exception itself: indexing error.args would raise
+            # IndexError if it was created without a message.
+            logger.error("error parsing %s: %s", odf_path, error)
+            returncode = data_error
+
+    for link in sorted(links):
+        if not link.exists():
+            logger.warning("path %s not found", link)
+        print(link, end=args.delimiter, file=stdout)
+    return returncode
+
+entry_point = cliutil.make_entry_point(__name__, PROGNAME)
+
+if __name__ == '__main__':
+    # sys.exit is always available; the bare ``exit`` name is only injected
+    # by the ``site`` module.
+    sys.exit(entry_point())
diff --git a/setup.py b/setup.py index e4743cc..02e5580 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup setup( name='conservancy_beancount', description="Plugin, library, and reports for reading Conservancy's books", - version='1.6.4', + version='1.7.0', author='Software Freedom Conservancy', author_email='info@sfconservancy.org', license='GNU AGPLv3+', @@ -37,6 +37,7 @@ setup( entry_points={ 'console_scripts': [ 'accrual-report = 
conservancy_beancount.reports.accrual:entry_point', + 'extract-odf-links = conservancy_beancount.tools.extract_odf_links:entry_point', 'fund-report = conservancy_beancount.reports.fund:entry_point', 'ledger-report = conservancy_beancount.reports.ledger:entry_point', 'opening-balances = conservancy_beancount.tools.opening_balances:entry_point', diff --git a/tests/repository/LinksReport.ods b/tests/repository/LinksReport.ods new file mode 100644 index 0000000000000000000000000000000000000000..79d257cd620b5e20850286fa9088b324b460b175 GIT binary patch literal 9166 zcmdUVbzGF~+U8o{6>Ynz^p)S!-k^pFG6`0N??D3`cl5UrWwFdH?|M<9K`tU}a`yWba~a zq-SkyX=b2jZw3M~I06mnL3(y(cJv@?BcLJ3z`@E0XislvZEK`wXlH6W59~N6#20&YQ>1 znkUcOCC|Af&zhwy1Jf4m(ihw^7Ys92EHYPsdFwXWEA|Cz&bceDMe7b_+a6_`?iJhK z*$bcY7Xxz^z7#A*6fMS=E_oNP1{E*+l&^oR-uA2C4XRlG@@*@qVK=g5IjV9svUW4F zaVN22CAoehxoJDSW;L^ZJ-unWsBx{Vc|D|MKcf90sckQ*_b|5mII;IQvtu`-_n@$4 zv!HXQxNkpg;3R+OxNz*WqHUwPd%L`6x3+V$zGtgxV7F@YsDA9IeBz{X=DdFLq<;Rq zVezVY<+5q{s%>zmXK1HmY;Rz6cX0Z!b?Ufd?zDgYbaZlWa`td;{%~aJYzvHprE2V($rHWcYT5zcrD8LM9Xw%G++5gvhzoUsUYEktl;f zZgigszDaq=*cQ3E9%_m*oX4W&R^#a-6WbAPEU|_3wC_EryXGt^})I^ zIm+KVUn`(bi6Z^p#R-r<-SYnelSC=u3aG~9__Epc&|Ox{cL=3v<3382Gb zMRJnbEN^U$qn6`Aafofa#c;QNou&(_g>$hy*L74fY_T+}>5CMl@eDZ-9WQ5dMb#Sp z-2}J9#%Ez|>JwJH#TV$2C*qn+ck)b@cXjyHh~1O+@5ar2DYqm<9KR>~I6uF8kOv)} zP``2r;RgW^tEil?z@bsL?o*hEJG_beR*Bk^0fd7%^8G3t0m#~YY!Qx{1((O?PR&T6 zl+P&fM^Oj&-05ZIExrKv-vwWZG86UFZFYxh798bLmtf`HzHiz!9Z*n!b&{4Zx~4h@ z$9$DKMIU_~yDP*!(IBO-v!sx3R@iW^=TohG7v;)s#HrO=T^y!y9yMa0s6RtWZmOVT z$9y-FK$v2JRJp`>uA2D5!jljU15zUMaNRLlD-RhXAiXY$3Ef@B{1P;4A)wFgV(X3V;558+h&(xOUcLqjq7Cf2cYPk7j^K$%(hxS`tnL($NF?~-` z`X%~b#XI%niS{lftzQr7s8_>$7zwpn3OgT{a9(eGAJm=vNj^E-Sq-74aMb7AEqtKM zS9^I_Q^CPENhjmgDQRSLnj!Q1oRQM=E%zK*)f);*_Q@oP#iU_Hh0&dv*wgd`tu@FT z(pZ<6jxq@~+2yBwRTfu5Vo#Z&p)>#9or~fw>ypO7@6}dZ>q)1rvqlZAhU^Z#>^GS; zlk41!i|=zA`l|`}FgBX+gXUUef*fSDFI)xoKLU;%P+4Rgdq6l-6) z_y8gY3yT_Bhj(07_59Lxr^-gHLLmao>z#QKE3%fM0=MI|&gHCOX*aen4-4>@J8 
z!|DB^z1^6LXjM#Q$Wfc}dEMx;FH;R9w_e*f?Dw_Cl^IL9{@h2&OHk~DgAdX7Wq!j9 z-Znpc$a_gn7e<-m<6D}tZ%m_B%tXUVi|KpqCY8}ntR4qOd|BON*#Q!87!%`Q{M5pQ z31}3F9kiG)!USgYZ}E0s&q$wQ2d^sYW1(TVZm0Vb{(pS;(0>ZIeFvRG_Yt?@F*XC< ziAW2771Z_qH6QC8$+c7iP+3VRm*y-!pGW>TqYml{r^saf9Oxoz`)4T=&^gyf9mm1NNZb=iLH^{|7EN{YL4IM_+yy={S56vAWLgKppoT& znBkv#`+@t(@Wh0;iA1`L>s8CZSq98V7P%j&@g z3txL0QN%k%I4|3ESKVa0NLMz@4lNkb zl5Z^3M7^oMW<+46krIeyc$ZHGYT6rK7N|rmZe>o=O;1jhjvGXyROj}`tT|Qds&I!} zax2g1GUa`BrRkzMQk0hWl4GFBxkd}p8tRX&O*PXJi2~Nl>2f1|(i}bB-1Jj76MJ)L zJ*mEl9o>Ox%a}-w2TNjA24l(|liFGr)#p(5l#`3~eZ{bJRYV%9_hCW@zDdvI&c=6R zHm-X1LNCT{+{nzc0q>i9X34JHzSlMmNI4Z=XN@^Wa;8K_M9*BvgUZ}+Eg2Nc)+bFl zEGi2bK5ggW`9jJy>c)H5Bc_@Oz!$IGX=`D3TGlF#<2Gzw*vKUz;;jk>!b87|JY67X z_L6;qsGuIiT`B=9^>A~)LOqnG3&;4t9Cs{lrc6jmHb{c(>MBC6D7Z_{xO24{l*@MT z&8V)&U(!FkdL;UNM5JsMOQx#mF?uA5|A9zJb;_{+OLGArFK;2LbY82y3VP^`?CnOu zo=m=K>^m*kUEP+i(7nL{8_IOMYSt`oUM%UB@0f-7YNv07)zmcG?PTGSh^I5+GVX|A zan9Z~kpwNgv7;Cji6GSAsRNhEvv}-UrV{0cqm8qdF|l%UuC-0*R8|ORAz>b1%O|0U z_vsTObkwTPPq8&=TRjX*O=V@s7jF zD9h>-mulwGsTBCUml0wRq)C5w|+^lRKW{8S^-N#Bcs|1FZF+X zmTXutG^MULRW;>>%aV{*zCMW@{jvgRz~vv*8+GPQ5tFxLs~FA$ccPfL7&@_vTvyO=cRv&l%8 zHl^qakM`U$V+#1B`gbJgl~Ixzv8f8#J_rl&F{Cp@Gz3HOujOz*YR^w;%$W#fZr&Pp zY@0S34~F|Z@HSE`$-F(NGD|8nc&p!Ix2iw8Jklh^lZpW=^_A!#_@b_D=#&k$u>>{0 zvZM4`dyYh=XCy3KaZqP-1{6#c88$U2tjFO+KwHoNT$UfdfqdC!3DtQEXWaH}t(PZh zh@JubwUR;QgF*gG@XIInt0yyVF{;z5mVFO&#T;6!4ACx}r+cOdLZ{6Eas)3S$`;rr zInfFjSU2Cyl}WCRxLml_^@!2gaH@RLzoS+nxmD!kHr^?i5c38(qcr8<*Q8BI-vnBa zT!I;>`2@e%DJwOjvN*%Jap$GN_snItt!A{=gEptXqs9H2Q@y|G+FdV=Fd^`wVSVTJ^JdNelGRkRt$0_<=s)`2oC?WycPY_sOdu# z3(+Mk^z2LA?G{hz7a%^Uf2T6|zbU~3g^cYKs*=ot;cHZ*l2zGwHACU-s|17&1` zH>j$DEdy5oL3Jmpx*L!8h%e_Qm}*h!a6?3Lxv%%|DSH+rycBwI^~Sh-Fc+xSLVdw@ zE-7t8z}5-2UiFG@usiKtEsmAv)V3SENr=tTrCO@V5E0V?u*O+aOP0K3zV^&GE2=|R zSKO3okV<Xw*fRQh7js14^9yp!ABP<$izRc^H(!>2dnFXtw3E>x!kyhjSz=p`vM6 zx=6X_gRO6Av`mD8aT0_Px?ZbuS{yPyrK1s0RlFL!TXuW!zC!tR`+S~@;M&0g0MO(B zzz?~=&fdk+$nM7mE~s099y8T_la5|qODR<3`6_#G<8*8F*SU}EIFNFW*EoR@welHn 
zM;*s>n#rhpqt4|!^CN-f;lk^7-Su0CSEL)OrxZq!bZ9UeGh+xjO)Mn1n`na(*=P3) zl9%(fnjUtoYjOJJ0dE$wUPS^)K)BQt#7S64kynr@H2&_pwG*+NTXNVi#N;xb>>0+Z_g3pmdyMQM%FKt80@lw%KARSe` z!JJ~Jp3C|!{3QqW-9(@{y*L!AwaUiE8DTgMK8e zawCOT-+51XFSUho?@u6;d?mtMj<0>KWPM=<#p=yQs(H2=*p(3C^uKTVH9btQTw=R& zz%L=td@fyfMh!u0XItYrAVsnEf`gB>RO(YPO;0+UydA)JwIw*^*Re*)txH)X=|HD2 zrBhH)<4r*t$d#@;G~iS2n_*KEwV%SeS*~}#l>$3+_C4+GvKyj8EMTNQ)lx*B0gh?O zFr1oO2)4HOb6Nos6J52bX(@}}37KMyML>2sQ<;2Qfq9|Uu3~#_A~yXXaj$bzY1YeV z9ZZ%PtUxCa&*WBt63phHExrX4O0rUwZ5O`sAdcRu=+g>)2R-A4Iv4%J=y1s`EZ-dk zZhn^_W^wtr7#dTFnS-%w?2U%26MzX`@;jlbwt9r#77MBreX*bnub%u4#zxEVG%k(? zy>aa@&k>zOy^7ZZIFUf4cVjoxV0qLrgRn^l9i)aP@t6Z+Nkuiha2zYuTDbz&SkWC) zdnUbs{N)=84u8d0g7VYaN(GEkX!eXam8x7TDWUiXQSebEbMk3uuv@h+M+Gz99}F9t z)Xw)86j=qC9l4e>ZX)@W8~UJclg9&FWOL;|DjY3w`O|2|5z0BnvEV7}e3`Gz)UYT9N8QXk1D>|DQs zstuq}6l{U!mGvK@!90M{#&Ju!A2@;ddQyrXEWv%Ebbw4Gy3}x5>%?q_u{=wxr)+An z$KIaC1Y=OarwT?K%neNO`B=f~-)s`3_u_3%SPYTUBBqjLgZG5CJ>h9y5Z9;E0~cWR z(eXFjI_E~4;I^EO8!h>}Zmg$oj>8+C%WU6qmu_k!@X9i%Xhvk(TV(p>S|8uwt%QD- zy}_*^W%NT@^T3pAV~JZSiKq2lr64QjLs)P~9qSXx8dALnb1KefEq@V>q}*zPfMrgV zQB5G`Fr)l>s5-Xv6%0a-%{t^T2Dcfd2w7K1QJZwrde3Gl?cY!0lQ<#9Wn+xV%!$){ z8Hyw92@%~L_{<)0NeubI71LH@ZMlS*TX4MeCNyMBE5YG?utAZ9`+I80{ft_-yKA3( z1L;fZ3+hwZtSF2n9Qn;NILZwZ@UmLQHqJZvuH?c)YJ}^TpZBMe*GZ^#q79l6TRYo_ z;b(`L%0kW^7$gp>Ywi&Yj=?q|$dpbN%N@D^F+v=Pv5;Qhh|h3+zV$KPDb*>NqVMKx zD1+Tb9R2SOGRN3oW?5Z~REj{%xmO8z>S;ieO$;It`Zc?GUnByM7LS81ukN^Hp5#86 z_|T9=xZUo(n!^LlNulY|_+w67XR`q^>ePM+&jIlT}T?D%mJvQ?lkwAPo7 zkrj@YzFtI?+IG~FH{M$cp1_oFmph8+Y98fkIm1Ppn{g8X@`*CTn_Yzp0}@lj_f_cG8#|uFkxkM~e5sS#N0v&CS$TO%6LzXI zd8{bBHdT9R1`#ZqCG=o^%=L{UeW?I}5_$#(hKBa8a&ms+72;Exz`;&~osIyk;P1qN ziLPM&Xoiho<~bgHZJ0@CWsUw%5=uT`0gMLh(A3Nesq0hwk29s6MYR~O(&r5?w0hZ5 z7I(SNVzVMv&NoUmew|uJHMGAZC}zCUe9AQT2c0il+;5S4?XqN z?~r~?O_i6XV=#}Eh0kMUf&7@7tn`3p#zuDb^tOh^gOP(kA4b$qM?AhpL(^{=U|?*i z`fANnTiE9IRR-nbz*$!IFK!eD>A5@6V7;K8c16w-`I#FcU6YK zBM=4)nw_t^<8cnHA{eX4+3%%=xm-RLXA4~UY2}D|)$)!0)x_amZ-Wuu_kFVW!LO5+ 
z>|}{KljCenKUnS7Z5;Q>;L$y%4H=$Z3_#A=&|YpVJ=FABa*sB3Jv4B281Fn;B8$>Z zVm_4Ft><=m*t(}5e!yKVsdN;!d^?nB0twryJS9>REBSS~5@s7YxnTf+G>U&Mmz|Nl zy&2HtuUvkhu3O}ysbYG`xAUfN!q(>;}HA&&HK z*$w7Q8iNi0Srxuf}1w?3M-wG%V5!>Vi^_(ilORI0CMh#E$1Gm@6 zY;`-crI!Urhd40>pRJhKa;-#X-6$jA6ds`oo5`bdza-NhJwzGI*`|Co}_Rozu2+7qv1+%Yz5E@yV#IgRI>_MiCK`=ONSs zKH3XlT0H}zpZ_IQ3Rz>2Rj3?}pO%x1Bz>GcARDh%=sK4*`0XV)31AjRL=AE5bB0N$ zB3ki0|7g?a7ez+0;Azt{{bGjwl)rx196_m~$m2M*Ow8emkY&4gCzUCpY9D@g+ z*()hyuUd$>&csGv0yB^LNzNu45P%?C6VSKcs}jX(!r4ri`CSfsHoXAssI*P zIGZe{bZ)PgHVmOyp>ck^-<}S!OHGlr@Q6%%bsc@rIl_KQm+Bohx-N$K$(?!Ejn}yZ z?&y5Ek?{6mS3uqAJxxgPS{yLzIR2ALQqmT_C{s0*qpCu2D;D@}1srW?Zr_w@Rf_T` z-6gd=r%j!7KzBCqxpJlrCH;a9PCCqPzU186NVq%(JnKBUym}|p@^)Q{-!2`bTYH$! zL-)BkPYbTY>)XdMm?`U+TLT-J-MCXy!Rr1aVlr55P4=R})QzoUr zb3l+X(5*m2JBWK0Jd{#X9pm?;Z~-!>g{;WkW|pT{qIdj(7des>GG#t2SkpLhqa*4h zZPHTg>ubd+y$d#Up>X8A?M&D=>{w%CBwAkS_?B30J{X=Od1oAZqAq&Bt#`PZ?^ja% zd8r25xRRnh&mlOx;CasAlzn-ufLkhY+)mR zL;glG{szrmKLI;33fuJqlZGs-&rNWztPS!BInckw|Ml#mj{{jO-o4~E*|iO;FJ(pR zzKm(mB~S?|C=1x+q*ymk8-tAMe3|{7q_4}vAh{xiI7~W^AXM?meXtEo%mG9tw z4YF9AP{k~dIqCJI^7knG0i;9}1?b*Mi!%J5-s$`xpx>l9cnpv#}#$^c^o?_Lse^Q?BiW{ zTov%o0PAPLh{ayQw(5!-ggJB7dYlQhn;`rssQ8&o*QB#0|#svJcdH%T6e{27?(tk(&Zjb+j zC_bM4lUe>d%TE*l@I%!9+14Lz_uu93zf1hT&fzEScOLvlq2za-zf1oA%=1IK|JfA( z!1JpG@c+sb@&~40ErEY#`XST*Y(0Nq`e_mTGs~|leg6lRzgq_X%=7DL*b)BcO8>MF z{+Z>6iSV;o{(5=6Z%K_$J zmcws$!_V*bpVhjbbKH9NSYiKf?e2Hf?;_DpR1WIjC8OVQzgO@-aRM)Xt>=G7{$9-f tL|Q+t!rx2V-;uvZ=%2{R$A7*2F<#3`!aOQq007$K&Eqk!bN{vb{|k$pI. + +import io + +import pytest + +from . 
import testutil
+
+from conservancy_beancount.tools import extract_odf_links
+
+# Spreadsheet fixture whose links the tests below extract.
+SRC_PATH = testutil.test_path('repository/LinksReport.ods')
+
+# Every distinct path the tool is expected to print for SRC_PATH.
+EXPECTED_FILE_LINKS = {
+    '/repository/Projects/project-data.yml',
+    str(testutil.test_path('repository/Projects/project-data.yml')),
+    str(testutil.test_path('repository/Projects/Bad Link.txt')),
+}
+
+@pytest.mark.parametrize('arglist,sep', [
+    (['-0'], '\0'),
+    (['-d', '\\v'], '\v'),
+    ([str(SRC_PATH)], '\n'),  # Test that links aren't duplicated
+])
+def test_extract_file_links(arglist, sep, caplog):
+    """Check the links printed for SRC_PATH under each delimiter option."""
+    # Build a new list instead of appending in place: the parametrized list
+    # object is shared with pytest and would keep growing if this test ran
+    # more than once in the same process.
+    arglist = [*arglist, str(SRC_PATH)]
+    stdout = io.StringIO()
+    stderr = io.StringIO()
+    exitcode = extract_odf_links.main(arglist, stdout, stderr)
+    assert exitcode == 0
+    assert not stderr.getvalue()
+    actual = stdout.getvalue().split(sep)
+    # Output ends with the delimiter, so split() leaves a trailing empty item.
+    if actual and not actual[-1]:
+        actual.pop()
+    assert len(actual) == len(EXPECTED_FILE_LINKS)
+    assert set(actual) == EXPECTED_FILE_LINKS
+    assert caplog.records
+    assert any(
+        log.levelname == 'WARNING'
+        and log.message.endswith('/Bad Link.txt not found')
+        for log in caplog.records
+    )