rtutil: Add RTLinkCache class to cache links to disk.

This will greatly reduce RT requests across multiple runs
and speed up link checking/conversion.
This commit is contained in:
Brett Smith 2020-03-27 07:35:45 -04:00
parent f227593655
commit a8407c7b6a
4 changed files with 236 additions and 6 deletions

View file

@ -126,8 +126,16 @@ class Config:
wrapper_client = self.rt_client(credentials, client) wrapper_client = self.rt_client(credentials, client)
if wrapper_client is None: if wrapper_client is None:
return None return None
cache_dir_path = self.cache_dir_path()
if cache_dir_path is None:
cache_db = None
else: else:
return rtutil.RT(wrapper_client) cache_name = '{}@{}.sqlite3'.format(
credentials.user,
urlparse.quote(str(credentials.server), ''),
)
cache_db = rtutil.RTLinkCache.setup(cache_dir_path / cache_name)
return rtutil.RT(wrapper_client, cache_db)
def rt_wrapper(self, def rt_wrapper(self,
credentials: RTCredentials=None, credentials: RTCredentials=None,

View file

@ -17,17 +17,142 @@
import functools import functools
import mimetypes import mimetypes
import re import re
import sqlite3
import urllib.parse as urlparse import urllib.parse as urlparse
import rt import rt
from pathlib import Path
from typing import ( from typing import (
Callable,
Iterator,
MutableMapping,
Optional, Optional,
Set,
Tuple, Tuple,
Union, Union,
) )
RTId = Union[int, str] RTId = Union[int, str]
TicketAttachmentIds = Tuple[str, Optional[str]]
_LinkCache = MutableMapping[TicketAttachmentIds, Optional[str]]
class RTLinkCache(_LinkCache):
"""Cache RT links to disk
This class provides a dict-like interface to a cache of RT links.
Once an object is in RT, a link to it should never change.
The only exception is when objects get shredded, and those objects
shouldn't be referenced in books anyway.
This implementation is backed by a sqlite database. You can call::
db = RTLinkCache.setup(path)
This method will try to open a sqlite database at the given path,
and set up necessary tables, etc.
If it succeeds, it returns a database connection you can use to
initialize the cache.
If it fails, it returns None, and the caller should use some other
dict-like object (like a normal dict) for caching.
You can give the result to the RT utility class either way,
and it will do the right thing for itself::
rt = RT(rt_client, db)
"""
CREATE_TABLE_SQL = """CREATE TABLE IF NOT EXISTS RTLinkCache(
ticket_id TEXT NOT NULL,
attachment_id TEXT,
url TEXT NOT NULL,
PRIMARY KEY (ticket_id, attachment_id)
)"""
@classmethod
def setup(cls, cache_path: Path) -> Optional[sqlite3.Connection]:
try:
db = sqlite3.connect(cache_path, isolation_level=None)
cursor = db.cursor()
cursor.execute(cls.CREATE_TABLE_SQL)
cursor.execute('SELECT url FROM RTLinkCache LIMIT 1')
have_data = cursor.fetchone() is not None
except sqlite3.OperationalError:
# If we couldn't get this far, sqlite provides no benefit.
return None
try:
# There shouldn't be any records where url is NULL, so running this
# DELETE pulls double duty for us: it tells us whether or not we
# can write to the database and it enforces database integrity.
cursor.execute('DELETE FROM RTLinkCache WHERE url IS NULL')
except sqlite3.OperationalError:
can_write = False
else:
can_write = True
print(cache_path, have_data, can_write)
if not (can_write or have_data):
# If there's nothing to read and no way to write, sqlite provides
# no benefit.
return None
elif not can_write:
# Set up an in-memory database that we can write to, seeded with
# the data available to read.
try:
cursor.close()
db.close()
db = sqlite3.connect(':memory:', isolation_level=None)
cursor = db.cursor()
cursor.execute('ATTACH DATABASE ? AS readsource',
('{}?mode=ro'.format(cache_path.as_uri()),))
cursor.execute(cls.CREATE_TABLE_SQL)
cursor.execute('INSERT INTO RTLinkCache SELECT * FROM readsource.RTLinkCache')
cursor.execute('DETACH DATABASE readsource')
except sqlite3.OperationalError as error:
# We're back to the case of having nothing to read and no way
# to write.
return None
cursor.close()
db.commit()
return db
def __init__(self, cache_db: sqlite3.Connection) -> None:
self._db = cache_db
self._nourls: Set[TicketAttachmentIds] = set()
def __iter__(self) -> Iterator[TicketAttachmentIds]:
yield from self._db.execute('SELECT ticket_id, attachment_id FROM RTLinkCache')
yield from self._nourls
def __len__(self) -> int:
cursor = self._db.execute('SELECT COUNT(*) FROM RTLinkCache')
return cursor.fetchone()[0] + len(self._nourls)
def __getitem__(self, key: TicketAttachmentIds) -> Optional[str]:
if key in self._nourls:
return None
cursor = self._db.execute(
'SELECT url FROM RTLinkCache WHERE ticket_id = ? AND attachment_id IS ?',
key,
)
retval = cursor.fetchone()
if retval is None:
raise KeyError(key)
else:
return retval[0]
def __setitem__(self, key: TicketAttachmentIds, value: Optional[str]) -> None:
if value is None:
self._nourls.add(key)
else:
ticket_id, attachment_id = key
self._db.execute(
'INSERT INTO RTLinkCache VALUES(?, ?, ?)',
(ticket_id, attachment_id, value),
)
def __delitem__(self, key: TicketAttachmentIds) -> None:
raise NotImplementedError("RTLinkCache.__delitem__")
class RT: class RT:
"""RT utility wrapper class """RT utility wrapper class
@ -38,7 +163,9 @@ class RT:
* Parse links * Parse links
* Verify that they refer to extant objects in RT * Verify that they refer to extant objects in RT
* Convert metadata links to RT web links * Convert metadata links to RT web links
* Cache results, to reduce network requests * Cache results, to reduce network requests.
You can set up an RTLinkCache to cache links to disks over multiple runs.
Refer to RTLinkCache's docstring for details and instructions.
""" """
PARSE_REGEXPS = [ PARSE_REGEXPS = [
@ -46,8 +173,7 @@ class RT:
re.compile(r'^rt://ticket/([0-9]+)(?:/attachments?/([0-9]+))?/?$'), re.compile(r'^rt://ticket/([0-9]+)(?:/attachments?/([0-9]+))?/?$'),
] ]
def __init__(self, rt_client: rt.Rt) -> None: def __init__(self, rt_client: rt.Rt, cache_db: Optional[sqlite3.Connection]=None) -> None:
self.rt = rt_client
urlparts = urlparse.urlparse(rt_client.url) urlparts = urlparse.urlparse(rt_client.url)
try: try:
index = urlparts.path.rindex('/REST/') index = urlparts.path.rindex('/REST/')
@ -56,6 +182,32 @@ class RT:
else: else:
base_path = urlparts.path[:index + 1] base_path = urlparts.path[:index + 1]
self.url_base = urlparts._replace(path=base_path) self.url_base = urlparts._replace(path=base_path)
self.rt = rt_client
self._cache: _LinkCache
if cache_db is None:
self._cache = {}
else:
self._cache = RTLinkCache(cache_db)
# mypy complains that the first argument isn't self, but this isn't meant
# to be a method, it's just an internal decrator.
def _cache_method(func: Callable) -> Callable: # type:ignore[misc]
@functools.wraps(func)
def caching_wrapper(self,
ticket_id: RTId,
attachment_id: Optional[RTId]=None,
) -> str:
cache_key = (str(ticket_id), attachment_id and str(attachment_id))
try:
url = self._cache[cache_key]
except KeyError:
if attachment_id is None:
url = func(self, ticket_id)
else:
url = func(self, ticket_id, attachment_id)
self._cache[cache_key] = url
return url
return caching_wrapper
def _extend_url(self, def _extend_url(self,
path_tail: str, path_tail: str,
@ -84,7 +236,7 @@ class RT:
fragment = 'txn-{}'.format(txn_id) fragment = 'txn-{}'.format(txn_id)
return self._extend_url('Ticket/Display.html', fragment, id=str(ticket_id)) return self._extend_url('Ticket/Display.html', fragment, id=str(ticket_id))
@functools.lru_cache() @_cache_method
def attachment_url(self, ticket_id: RTId, attachment_id: RTId) -> Optional[str]: def attachment_url(self, ticket_id: RTId, attachment_id: RTId) -> Optional[str]:
attachment = self.rt.get_attachment(ticket_id, attachment_id) attachment = self.rt.get_attachment(ticket_id, attachment_id)
if attachment is None: if attachment is None:
@ -118,7 +270,7 @@ class RT:
return (ticket_id, attachment_id) return (ticket_id, attachment_id)
return None return None
@functools.lru_cache() @_cache_method
def ticket_url(self, ticket_id: RTId) -> Optional[str]: def ticket_url(self, ticket_id: RTId) -> Optional[str]:
if self.rt.get_ticket(ticket_id) is None: if self.rt.get_ticket(ticket_id) is None:
return None return None

View file

@ -207,6 +207,22 @@ def test_rt_wrapper_cache_responds_to_external_credential_changes(rt_environ):
rt2 = config.rt_wrapper(None, testutil.RTClient) rt2 = config.rt_wrapper(None, testutil.RTClient)
assert rt1 is not rt2 assert rt1 is not rt2
def test_rt_wrapper_has_cache(tmp_path):
with update_environ(XDG_CACHE_DIR=tmp_path):
config = config_mod.Config()
rt = config.rt_wrapper(None, testutil.RTClient)
rt.exists(1)
expected = 'conservancy_beancount/{}@*.sqlite3'.format(RT_FILE_CREDS[1])
assert any(tmp_path.glob(expected))
def test_rt_wrapper_without_cache(tmp_path):
tmp_path.chmod(0)
with update_environ(XDG_CACHE_DIR=tmp_path):
config = config_mod.Config()
rt = config.rt_wrapper(None, testutil.RTClient)
tmp_path.chmod(0o600)
assert not any(tmp_path.iterdir())
def test_cache_mkdir(tmp_path): def test_cache_mkdir(tmp_path):
expected = tmp_path / 'TESTcache' expected = tmp_path / 'TESTcache'
with update_environ(XDG_CACHE_DIR=tmp_path): with update_environ(XDG_CACHE_DIR=tmp_path):

View file

@ -14,6 +14,8 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
import contextlib
import pytest import pytest
from . import testutil from . import testutil
@ -46,6 +48,14 @@ def new_client():
TICKET_DATA = testutil.RTClient.TICKET_DATA.copy() TICKET_DATA = testutil.RTClient.TICKET_DATA.copy()
return RTClient() return RTClient()
def new_cache(database=':memory:'):
db = rtutil.RTLinkCache.setup(database)
if db is None:
print("NOTE: did not set up database cache at {}".format(database))
return contextlib.nullcontext(db)
else:
return contextlib.closing(db)
@pytest.mark.parametrize('ticket_id,attachment_id,expected', EXPECTED_URLS) @pytest.mark.parametrize('ticket_id,attachment_id,expected', EXPECTED_URLS)
def test_url(rt, ticket_id, attachment_id, expected): def test_url(rt, ticket_id, attachment_id, expected):
if expected is not None: if expected is not None:
@ -125,3 +135,47 @@ def test_uncommon_server_url_parsing():
client = testutil.RTClient(url + 'REST/1.0/') client = testutil.RTClient(url + 'REST/1.0/')
rt = rtutil.RT(client) rt = rtutil.RT(client)
assert rt.url(1).startswith(url) assert rt.url(1).startswith(url)
def test_shared_cache(new_client):
ticket_id, _, expected = EXPECTED_URLS[0]
expected = DEFAULT_RT_URL + expected
with new_cache() as cachedb:
rt1 = rtutil.RT(new_client, cachedb)
assert rt1.url(ticket_id) == expected
new_client.TICKET_DATA.clear()
rt2 = rtutil.RT(new_client, cachedb)
assert rt2.url(ticket_id) == expected
assert not rt2.exists(ticket_id + 1)
assert rt1 is not rt2
def test_no_shared_cache(new_client):
with new_cache() as cache1, new_cache() as cache2:
rt1 = rtutil.RT(new_client, cache1)
rt2 = rtutil.RT(new_client, cache2)
assert rt1.exists(1)
new_client.TICKET_DATA.clear()
assert not rt2.exists(1)
assert rt1.exists(1)
def test_read_only_cache(new_client, tmp_path):
db_path = tmp_path / 'test.db'
ticket_id, _, expected = EXPECTED_URLS[0]
expected = DEFAULT_RT_URL + expected
with new_cache(db_path) as cache1:
rt1 = rtutil.RT(new_client, cache1)
assert rt1.url(ticket_id) == expected
new_client.TICKET_DATA.clear()
db_path.chmod(0o400)
with new_cache(db_path) as cache2:
rt2 = rtutil.RT(new_client, cache2)
assert rt2.url(ticket_id) == expected
assert rt2.url(ticket_id + 1) is None
def test_results_not_found_only_in_transient_cache(new_client):
with new_cache() as cache:
rt1 = rtutil.RT(new_client, cache)
rt2 = rtutil.RT(new_client, cache)
assert not rt1.exists(9)
new_client.TICKET_DATA['9'] = [('99', '(Unnamed)', 'text/plain', '0b')]
assert not rt1.exists(9)
assert rt2.exists(9)