Source code for bmp.pyterrier

import numpy as np
import pandas as pd
from pathlib import Path
import pyterrier as pt
import pyterrier_alpha as pta
from bmp import Indexer, Searcher


[docs]
class BmpIndex(pta.Artifact, pt.Indexer):
    """ Represents a Block-Max Pruning (BMP) index stored in a directory on disk.

    The object is the handle for the index directory. The underlying searcher
    is not created at construction time; it is created on first use.

    .. cite.dblp:: conf/sigir/MalliaST24
    """
    # Identifiers describing what kind of artifact this is and its on-disk format.
    ARTIFACT_TYPE = 'sparse_index'
    ARTIFACT_FORMAT = 'bmp'

    def __init__(self, path: str):
        """ Create a handle for the index at ``path`` without loading it.

        Args:
            path (str): Path to the index directory.
        """
        super().__init__(path)
        # No searcher yet; one is created on demand and cached here.
        self._searcher = None


[docs]
    def built(self) -> bool:
        """ Report whether this index is present on disk.

        Returns:
            bool: ``True`` when the index path exists, ``False`` when it does not.
        """
        index_location = Path(self.path)
        return index_location.exists()



[docs]
    def indexer(
        self,
        *,
        bsize: int = 32,
        compress_range: bool = False,
        scale_float: float = 100.,
    ) -> pt.Indexer:
        """ Build a :class:`bmp.pyterrier.BmpIndexer` that writes documents into this index.

        Args:
            bsize (int): Block size for block-max pruning.
            compress_range (bool): Whether to compress the index.
            scale_float (float): Factor applied to float token values when they are converted to integers.

        Returns:
            BmpIndexer: An indexer bound to this index and configured with the given settings.
        """
        settings = {
            'bsize': bsize,
            'compress_range': compress_range,
            'scale_float': scale_float,
        }
        return BmpIndexer(self, **settings)



[docs]
    def index(self, inp: pt.model.IterDict) -> pt.Artifact:
        """ Index the given documents using the default indexer settings.

        Args:
            inp: An iterable of documents (dicts containing ``docno`` and ``toks`` keys) to index.

        Returns:
            Whatever the default indexer's ``index`` call returns.
        """
        default_indexer = self.indexer()
        return default_indexer.index(inp)



[docs]
    def retriever(
        self,
        *,
        num_results: int = 1000,
        alpha: float = 1.0,
        beta: float = 1.0,
    ) -> pt.Transformer:
        """ Build a :class:`bmp.pyterrier.BmpRetriever` that searches this index.

        Args:
            num_results: the number of results per query to retrieve.
            alpha: block termination threshold (terminate retrieval when the maximum block score is less than ``alpha`` of the threshold). Decreasing this value increases the chance documents are missed, but speeds up retrieval by pruning more blocks. For exact retrieval, use ``alpha=1.0`` (default).
            beta: query term pruning factor (keeps the top ``beta`` weight of query terms). Decreasing this value introduces score approximation error, but reduces computational cost. For exact scoring, use ``beta=1.0`` (default).

        Returns:
            A retriever bound to this index and configured with the given settings.
        """
        settings = {
            'num_results': num_results,
            'alpha': alpha,
            'beta': beta,
        }
        return BmpRetriever(self, **settings)



[docs]
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """ Run the queries in ``inp`` against this index with the default retriever settings (exact retrieval).

        Args:
            inp: A DataFrame containing queries with a ``query_toks`` column.

        Returns:
            DataFrame containing retrieved documents with ``docno``, ``score``, and ``rank`` columns.
        """
        default_retriever = self.retriever()
        return default_retriever(inp)



[docs]
    def load_into_memory(self):
        """ Return the searcher for this index, creating it on first use.

        The searcher is opened from the ``index.bmp`` file under the index path
        and cached on the instance, so repeated calls hand back the same object.

        Returns:
            Searcher: The cached searcher instance.
        """
        searcher = self._searcher
        if searcher is None:
            index_file = self.path / 'index.bmp'
            searcher = Searcher(str(index_file))
            self._searcher = searcher
        return searcher



[docs]
    def close(self):
        """ Drops this index's reference to the in-memory searcher, if any.

        After this call, the next ``load_into_memory`` creates a new searcher.
        """
        self._searcher = None


    def __enter__(self):
        """ Loads the searcher on entering a ``with`` block and returns this index. """
        self.load_into_memory()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """ Calls ``close`` on leaving a ``with`` block.

        Returns ``None``, so an exception raised inside the block is not suppressed.
        """
        self.close()




[docs]
class BmpIndexer(pt.Indexer):
    """ An indexer that builds a BMP index using the settings it was constructed with. """
    def __init__(
        self,
        bmp_index: BmpIndex,
        bsize: int = 32,
        compress_range: bool = False,
        scale_float: float = 100.
    ):
        """ Store the target index and the build settings; nothing is written yet.

        Args:
            bmp_index: BMP index object to create.
            bsize: Block size for block-max pruning.
            compress_range: Whether to compress the index.
            scale_float: Scaling factor for float token values into integers.
        """
        self.bmp_index = bmp_index
        self.bsize = bsize
        self.compress_range = compress_range
        self.scale_float = scale_float


[docs]
    def index(self, inp: pt.model.IterDict) -> pt.Artifact:
        """ Build the BMP index from a stream of documents.

        Uses the ``bsize``, ``compress_range`` and ``scale_float`` settings this
        indexer was constructed with. For each document, if its first token
        weight is a ``float``, every weight in that document is multiplied by
        ``scale_float`` and truncated with ``int``; otherwise the weights are
        passed through unchanged.

        Args:
            inp: An iterable of documents (dicts containing ``docno`` and ``toks`` keys) to index.

        Returns:
            The BMP index object this indexer was constructed with.

        Raises:
            AssertionError: If the target index already exists on disk.
        """
        # Checked explicitly rather than with ``assert`` so the guard still runs
        # under ``python -O``; the exception type callers see is unchanged.
        if self.bmp_index.built():
            raise AssertionError(f'index already built at {self.bmp_index.path}')
        with pta.ArtifactBuilder(self.bmp_index) as builder:
            indexer = Indexer(str(self.bmp_index.path/'index.bmp'), bsize=self.bsize, compress_range=self.compress_range)
            count = 0
            for doc in inp:
                vector = doc['toks']
                # Only the first weight is inspected to decide whether the
                # document holds float weights that need scaling to integers.
                if vector and isinstance(next(iter(vector.values())), float):
                    vector = {k: int(v * self.scale_float) for k, v in vector.items()}
                indexer.add_document(doc['docno'], vector)
                count += 1
            indexer.finish()
            # Record the build settings and corpus size alongside the index.
            builder.metadata['bsize'] = self.bsize
            builder.metadata['compress_range'] = self.compress_range
            builder.metadata['scale_float'] = self.scale_float
            builder.metadata['num_docs'] = count
        return self.bmp_index





[docs]
class BmpRetriever(pt.Transformer):
    """ A transformer that retrieves over a BMP index. """
    def __init__(
        self,
        bmp_index: BmpIndex,
        *,
        num_results: int = 1000,
        alpha: float = 1.0,
        beta: float = 1.0
    ):
        """ Store the index to search and the retrieval settings; the index is not loaded here.

        Args:
            bmp_index: BMP index object to retrieve over.
            num_results: the number of results per query to retrieve.
            alpha: block termination threshold (terminate retrieval when the maximum block score is less than ``alpha`` of the threshold). Decreasing this value increases the chance documents are missed, but speeds up retrieval by pruning more blocks. For exact retrieval, use ``alpha=1.0`` (default).
            beta: query term pruning factor (keeps the top ``beta`` weight of query terms). Decreasing this value introduces score approximation error, but reduces computational cost. For exact scoring, use ``beta=1.0`` (default).
        """
        self.bmp_index = bmp_index
        self.num_results = num_results
        self.alpha = alpha
        self.beta = beta


[docs]
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """ Search the index once per query row and collect the ranked results.

        Args:
            inp: A DataFrame containing queries with a ``query_toks`` column.

        Returns:
            DataFrame containing retrieved documents with ``docno``, ``score``, and ``rank`` columns.
        """
        pta.validate.query_frame(inp, extra_columns=['query_toks'])
        searcher = self.bmp_index.load_into_memory()
        results = pta.DataFrameBuilder(['docno', 'score', 'rank'])
        # The search settings are the same for every query in the frame.
        search_settings = {
            'k': self.num_results,
            'alpha': self.alpha,
            'beta': self.beta,
        }
        for query_toks in inp['query_toks']:
            docnos, scores = searcher.search(query_toks, **search_settings)
            # Ranks are 0-based positions in the order the searcher returned.
            ranks = np.arange(len(scores))
            results.extend({
                'docno': docnos,
                'score': scores,
                'rank': ranks,
            })
        return results.to_df(inp)


    def fuse_rank_cutoff(self, k):
        """ Offer a retriever limited to ``k`` results when that is fewer than this one returns.

        NOTE(review): presumably called by PyTerrier when a rank cutoff follows
        this retriever in a pipeline; confirm against the PyTerrier docs.

        Args:
            k: The rank cutoff to apply.

        Returns:
            A new :class:`BmpRetriever` over the same index with ``num_results=k``
            and the same ``alpha``/``beta`` when ``num_results`` exceeds ``k``;
            otherwise ``None``.
        """
        if self.num_results > k:
            return BmpRetriever(
                self.bmp_index,
                num_results=k,
                alpha=self.alpha,
                beta=self.beta,
            )
        return None