Source code for pyterrier_anserini._index

import os
from typing import Any, Dict, List, Literal, Optional, Union

import pyterrier as pt
import pyterrier_alpha as pta

import pyterrier_anserini
from pyterrier_anserini import J
from pyterrier_anserini._similarity import DEFAULT_WMODEL_ARGS, AnseriniSimilarity

# Type accepted by the ``fields`` / ``include_fields`` arguments below: a list of
# field names, a single field name, or the literal '*' meaning "all fields".
# (``Literal['*']`` is already covered by ``str``; it is listed for documentation.)
_TFields = Union[List[str], str, Literal['*']]

@pt.java.required
class AnseriniIndex(pta.Artifact):
    """An Anserini index.

    An Anserini index is a directory containing a Lucene index built with Anserini.
    This object can be used to construct retrieval transformers.
    """

    def __init__(self, path: str):
        """Initializes a new Anserini index.

        Args:
            path: The path to the index.
        """
        self.path = path

    def built(self) -> bool:
        """Checks if this index is built.

        Returns:
            True if this index is built, False otherwise.
        """
        # The index is considered built as soon as its path exists on disk.
        return os.path.exists(self.path)

    def indexer(self, *, fields: _TFields = '*', verbose: bool = False) -> pt.Indexer:
        """Provides an indexer for this index.

        Args:
            fields: The fields to index. If '*' (default), all fields are indexed.
                Otherwise, the values of the fields provided in this argument are
                concatenated and indexed.
            verbose: Whether to display a progress bar when indexing.

        Returns:
            An indexer that writes to this index.
        """
        return pyterrier_anserini.AnseriniIndexer(self, fields=fields, verbose=verbose)

    def retriever(
        self,
        similarity: Union[str, AnseriniSimilarity],
        similarity_args: Optional[Dict[str, Any]] = None,
        *,
        num_results: int = 1000,
        include_fields: Optional[_TFields] = None,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a retriever that uses the specified similarity function.

        Args:
            similarity: The similarity function to use.
            similarity_args: The arguments to the similarity function. Defaults to
                None (no arguments).
            num_results: The number of results to return. Defaults to 1000.
            include_fields: A list of the fields to include in the results. If
                `None` (default), no extra fields are included. If '*', all fields
                are included.
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to retrieve documents from this index.
        """
        return pyterrier_anserini.AnseriniRetriever(
            index=self,
            similarity=similarity,
            similarity_args=similarity_args,
            num_results=num_results,
            include_fields=self._resolve_fields(include_fields),
            verbose=verbose,
        )

    def bm25(
        self,
        *,
        k1: float = DEFAULT_WMODEL_ARGS['bm25.k1'],
        b: float = DEFAULT_WMODEL_ARGS['bm25.b'],
        num_results: int = 1000,
        include_fields: Optional[_TFields] = None,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a retriever that uses BM25 over this index.

        Args:
            k1: The BM25 k1 parameter. Defaults to 0.9.
            b: The BM25 b parameter. Defaults to 0.4.
            num_results: The number of results to return. Defaults to 1000.
            include_fields: A list of the fields to include in the results. If
                `None` (default), no extra fields are included. If '*', all fields
                are included.
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to retrieve documents from this index
            using BM25.
        """
        return pyterrier_anserini.AnseriniRetriever(
            index=self,
            similarity=AnseriniSimilarity.bm25,
            similarity_args={'bm25.k1': k1, 'bm25.b': b},
            num_results=num_results,
            include_fields=self._resolve_fields(include_fields),
            verbose=verbose,
        )

    def qld(
        self,
        *,
        mu: float = DEFAULT_WMODEL_ARGS['qld.mu'],
        num_results: int = 1000,
        include_fields: Optional[_TFields] = None,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a retriever that uses Query Likelihood with Dirichlet smoothing.

        Args:
            mu: The Dirichlet smoothing parameter. Defaults to 1000.
            num_results: The number of results to return. Defaults to 1000.
            include_fields: A list of the fields to include in the results. If
                `None` (default), no extra fields are included. If '*', all fields
                are included.
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to retrieve documents from this index
            using qld.
        """
        return pyterrier_anserini.AnseriniRetriever(
            index=self,
            similarity=AnseriniSimilarity.qld,
            # The key must match the dotted name used in DEFAULT_WMODEL_ARGS
            # ('qld.mu'), in the same way bm25() passes 'bm25.k1' / 'bm25.b'.
            similarity_args={'qld.mu': mu},
            num_results=num_results,
            include_fields=self._resolve_fields(include_fields),
            verbose=verbose,
        )

    def tfidf(
        self,
        *,
        num_results: int = 1000,
        include_fields: Optional[_TFields] = None,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a TF-IDF retriever over this index.

        Args:
            num_results: The number of results to return. Defaults to 1000.
            include_fields: A list of the fields to include in the results. If
                `None` (default), no extra fields are included. If '*', all fields
                are included.
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to retrieve documents from this index
            using TF-IDF.
        """
        return pyterrier_anserini.AnseriniRetriever(
            index=self,
            similarity=AnseriniSimilarity.tfidf,
            num_results=num_results,
            include_fields=self._resolve_fields(include_fields),
            verbose=verbose,
        )

    def impact(
        self,
        *,
        num_results: int = 1000,
        include_fields: Optional[_TFields] = None,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a retriever for pre-computed impact scores.

        This is called "quantized" scoring in PISA or "TF" scoring in Terrier.

        Args:
            num_results: The number of results to return. Defaults to 1000.
            include_fields: A list of the fields to include in the results. If
                `None` (default), no extra fields are included. If '*', all fields
                are included.
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to retrieve documents from this index
            using impact scores.
        """
        return pyterrier_anserini.AnseriniRetriever(
            index=self,
            similarity=AnseriniSimilarity.impact,
            num_results=num_results,
            include_fields=self._resolve_fields(include_fields),
            verbose=verbose,
        )

    def reranker(
        self,
        similarity: Union[str, AnseriniSimilarity],
        similarity_args: Optional[Dict[str, Any]] = None,
        *,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a reranker that uses the specified weighting model.

        Args:
            similarity: The similarity function to use.
            similarity_args: The arguments to the similarity function. Defaults to
                None (no arguments).
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to score (rerank) documents from this
            index.
        """
        return pyterrier_anserini.AnseriniReRanker(
            index=self,
            similarity=similarity,
            similarity_args=similarity_args,
            verbose=verbose,
        )

    def text_loader(
        self,
        fields: _TFields = '*',
        *,
        verbose: bool = False,
    ) -> pt.Transformer:
        """Provides a transformer that loads the text from this index for each document.

        Args:
            fields: The fields to extract. When the literal '*' (default), extracts
                all available fields.
            verbose: Output verbose logging. Defaults to False.

        Returns:
            A transformer that can be used to load the text from this index for
            each document.
        """
        return pyterrier_anserini.AnseriniTextLoader(
            index=self,
            fields=self._resolve_fields(fields),
            verbose=verbose,
        )

    def _searcher(self):
        """Creates a new pyserini ``LuceneSearcher`` over this index.

        A fresh searcher is constructed on every call. The index must already be
        built; otherwise an ``AssertionError`` is raised.
        """
        # Imported lazily so that pyserini is only needed when a searcher is requested.
        from pyserini.search.lucene import LuceneSearcher
        assert self.built(), "a searcher object can only be created if the index is built"
        return LuceneSearcher(self.path)

    def fields(self) -> List[str]:
        """Lists the fields of this index, excluding the 'id' field.

        Returns:
            The names reported by Anserini's field info for this index, without 'id'.
        """
        field_info = J.IndexReaderUtils.getFieldInfo(self._searcher().object.reader)
        return [name for name in field_info if name != 'id']

    def _resolve_fields(self, fields: Optional[_TFields]) -> Optional[List[str]]:
        """Normalizes a ``fields`` argument into an explicit list of field names.

        Args:
            fields: ``None``, the literal '*', a single field name, or a list of
                field names.

        Returns:
            ``None`` when ``fields`` is ``None``; all fields of this index when
            ``fields`` is '*'; a one-element list for a single field name;
            otherwise a new list holding the provided field names.
        """
        if fields is None:
            return None
        if fields == '*':
            # Expanding '*' requires reading the index, so the index must be built.
            return self.fields()
        if isinstance(fields, str):
            return [fields]
        # Copy so the result never aliases the caller's own list.
        return list(fields)

    def num_docs(self) -> int:
        """Returns the total number of documents reported by the index searcher."""
        return self._searcher().object.get_total_num_docs()

    def __repr__(self):
        """Returns a string of the form ``AnseriniIndex('<path>')``."""
        return f"AnseriniIndex({self.path!r})"