# Source code for pyterrier_caching.util

from contextlib import contextmanager
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
import pyterrier as pt

@contextmanager
def closing_memmap(*args, **kwargs):
    """Create a :class:`numpy.memmap` and release it when the context is exited.

    :class:`numpy.memmap` doesn't support the context manager protocol directly; this wrapper lets it be used
    in a ``with`` statement.

    Note:
        On exit, this function only drops *its own* reference to the memmap. Any other reference that is kept
        alive past the ``with`` block presumably keeps the underlying mapping open as well.

    Args:
        *args: Positional arguments to pass to :class:`numpy.memmap`.
        **kwargs: Keyword arguments to pass to :class:`numpy.memmap`.

    Yields:
        The newly created :class:`numpy.memmap`.

    Example:

    .. code-block:: python
        :caption: Using a :func:`~pyterrier_caching.util.closing_memmap` context manager.

        from pyterrier_caching import closing_memmap
        with closing_memmap('file.npy', dtype='float32', mode='w+', shape=(10, 10)) as mmp:
            ...  # do what you want with mmp here!
        # mmp is closed here
    """
    # Constructing outside the ``try`` is fine: if construction raises, there is nothing to release.
    mapped = np.memmap(*args, **kwargs)
    try:
        yield mapped
    finally:
        # Drop this frame's reference, whether the body finished normally or raised.
        del mapped
class Lazy(pt.Transformer):
    """A :class:`~pyterrier.Transformer` that doesn't initialize until it is used.

    This is useful in cases where loading a transformer is lengthy or allocates resources that are not always
    necessary. For instance a cached neural scorer allocates GPU memory, but often isn't needed when used
    with a :class:`~pyterrier_caching.ScorerCache`.

    Example:

    .. code-block:: python
        :caption: Using a :class:`~pyterrier_caching.Lazy` :class:`~pyterrier_dr.ElectraScorer` with a :class:`~pyterrier_caching.ScorerCache`.

        from pyterrier_caching import Lazy, ScorerCache
        from pyterrier_dr import ElectraScorer

        lazy_scorer = Lazy(ElectraScorer) # ElectraScorer not loaded yet
        cached_scorer = ScorerCache('electra.cache', lazy_scorer)

        cached_scorer([{
            'qid': '0',
            'query': 'terrier breeds',
            'docno': 'doc1',
            'text': 'There are many breeds of terriers, including the Scottish and Jack Russell Terrier.',
        }])
        # ElectraScorer only loaded if ('0', 'doc1') is not yet in electra.cache
    """

    def __init__(self, fn_transformer: Callable[..., pt.Transformer], *fn_args, **fn_kwargs):
        """
        Args:
            fn_transformer: A function that returns a transformer when called (or the transformer class itself).
            fn_args: Positional arguments to pass to ``fn_transformer`` when loading it.
            fn_kwargs: Keyword arguments to pass to ``fn_transformer`` when loading it.
        """
        self.fn_transformer = fn_transformer
        self.fn_args = fn_args
        self.fn_kwargs = fn_kwargs
        # ``None`` means "not loaded"; populated by the first call to :meth:`load`.
        self._transformer = None

    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Load the wrapped transformer if necessary, then apply it to ``inp``."""
        return self.load()(inp)

    def load(self) -> pt.Transformer:
        """Load the transformer if it isn't already loaded, and return it."""
        if not self.loaded():
            self._transformer = self.fn_transformer(*self.fn_args, **self.fn_kwargs)
        return self._transformer

    def unload(self) -> None:
        """Unloads the transformer. Subsequent calls to :meth:`load` will re-load it."""
        self._transformer = None

    def loaded(self) -> bool:
        """Return whether the transformer is currently loaded."""
        return self._transformer is not None
def meta_file_compat(path):
    """Migrate a pre-0.1.0 cache directory to the ``pt_meta.json`` naming scheme.

    Until version 0.1.0, pt_meta.json was called meta.json. To ensure compatibility between caches created with
    version <0.1.0 and >=0.1.0, this method moves meta.json to pt_meta.json and links meta.json -> pt_meta.json.

    The end effect is that caches created with version <0.1.0 will be compatible with >=0.1.0, but caches
    created with >=0.1.0 will NOT be compatible with those created with <0.1.0.

    Nothing is done when ``meta.json`` is absent or ``pt_meta.json`` already exists, so calling this
    repeatedly on the same directory is safe.

    Args:
        path: The cache directory (anything accepted by :class:`pathlib.Path`).
    """
    path = Path(path)
    old_path = path / 'meta.json'
    new_path = path / 'pt_meta.json'
    if old_path.exists() and not new_path.exists():
        old_path.rename(new_path)
        # Link to the bare file name rather than to ``new_path``: a symlink target is resolved relative to the
        # directory containing the link, so ``<path>/pt_meta.json`` would dangle whenever ``path`` is relative.
        # A sibling-relative link also survives the cache directory being moved.
        old_path.symlink_to(new_path.name)