Source code for pyterrier_caching.util
from contextlib import contextmanager
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
import pyterrier as pt
[docs]
@contextmanager
def closing_memmap(*args, **kwargs):
    """Wrap :class:`numpy.memmap` so that it can be used in a ``with`` statement.

    :class:`numpy.memmap` does not implement the context manager protocol itself. This helper
    creates the memmap on entry, yields it, and drops its own reference to it when the context
    exits (whether normally or through an exception).

    Args:
        *args: Positional arguments forwarded to :class:`numpy.memmap`.
        **kwargs: Keyword arguments forwarded to :class:`numpy.memmap`.

    Yields:
        The newly created :class:`numpy.memmap`.

    Example:

    .. code-block:: python
        :caption: Using a :func:`~pyterrier_caching.util.closing_memmap` context manager.

        from pyterrier_caching import closing_memmap

        with closing_memmap('file.npy', dtype='float32', mode='w+', shape=(10, 10)) as mmp:
            mmp[:] = 0  # do what you want with mmp here!
        # the context manager has released mmp here
    """
    # If construction fails there is nothing to release, so the exception simply propagates.
    handle = np.memmap(*args, **kwargs)
    try:
        yield handle
    finally:
        # Drop this frame's reference so the mapping is not kept alive by the generator.
        del handle
[docs]
class Lazy(pt.Transformer):
    """A :class:`~pyterrier.Transformer` that doesn't initialize until it is used.

    This is useful in cases where loading a transformer is lengthy or allocates resources that are not always
    necessary. For instance a cached neural scorer allocates GPU memory, but often isn't needed when used
    with a :class:`~pyterrier_caching.ScorerCache`.

    Example:

    .. code-block:: python
        :caption: Using a :class:`~pyterrier_caching.Lazy` :class:`~pyterrier_dr.ElectraScorer` with a :class:`~pyterrier_caching.ScorerCache`.

        from pyterrier_caching import Lazy, ScorerCache
        from pyterrier_dr import ElectraScorer

        lazy_scorer = Lazy(ElectraScorer) # ElectraScorer not loaded yet
        cached_scorer = ScorerCache('electra.cache', lazy_scorer)
        cached_scorer([{
            'qid': '0',
            'query': 'terrier breeds',
            'docno': 'doc1',
            'text': 'There are many breeds of terriers, including the Scottish and Jack Russell Terrier.'
        }])
        # ElectraScorer only loaded if ('0', 'doc1') is not yet in electra.cache
    """

    def __init__(self, fn_transformer: Callable[..., pt.Transformer], *fn_args, **fn_kwargs):
        """
        Args:
            fn_transformer: A function that returns a transformer when called (or the transformer class itself).
            fn_args: Positional arguments to pass to ``fn_transformer`` when loading it.
            fn_kwargs: Keyword arguments to pass to ``fn_transformer`` when loading it.
        """
        self.fn_transformer = fn_transformer
        self.fn_args = fn_args
        self.fn_kwargs = fn_kwargs
        # None means "not loaded"; populated on the first call to load().
        self._transformer = None

    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Load the wrapped transformer if necessary, then apply it to ``inp``."""
        return self.load()(inp)

    def load(self) -> pt.Transformer:
        """Load the transformer if it isn't already loaded, and return it."""
        if not self.loaded():
            self._transformer = self.fn_transformer(*self.fn_args, **self.fn_kwargs)
        return self._transformer

    def unload(self) -> None:
        """Unloads the transformer. Subsequent calls to :meth:`load` will re-load it."""
        self._transformer = None

    def loaded(self) -> bool:
        """Return whether the transformer is currently loaded."""
        return self._transformer is not None
def meta_file_compat(path):
    """Migrate a cache directory from the legacy ``meta.json`` name to ``pt_meta.json``.

    Until version 0.1.0, pt_meta.json was called meta.json. To ensure compatibility between caches created with
    version <0.1.0 and >=0.1.0, this function moves meta.json to pt_meta.json and links meta.json -> pt_meta.json.

    The end effect is that caches created with version <0.1.0 remain readable by both old and new versions,
    but caches created with >=0.1.0 (which only contain pt_meta.json) will NOT be readable by versions <0.1.0.

    Nothing is done when ``meta.json`` does not exist or ``pt_meta.json`` already exists, so the function is
    safe to call repeatedly.

    Args:
        path: The cache directory (anything accepted by :class:`pathlib.Path`).
    """
    path = Path(path)
    old_path = path / 'meta.json'
    new_path = path / 'pt_meta.json'
    if old_path.exists() and not new_path.exists():
        old_path.rename(new_path)
        # A symlink target is resolved relative to the directory containing the link, so link to the
        # sibling file by name. Using the full ``new_path`` here would produce a dangling link whenever
        # ``path`` is relative, and a stale one whenever the cache directory is later moved.
        old_path.symlink_to(new_path.name)