Source code for pyterrier_anserini._text_loader
from typing import List, Union
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta
import pyterrier_anserini
from pyterrier_anserini import AnseriniIndex
[docs]
@pt.java.required
class AnseriniTextLoader(pt.Transformer):
    """A transformer that provides access to text fields from an Anserini index."""

    def __init__(self,
                 index: Union[AnseriniIndex, str],
                 fields: List[str],
                 *,
                 verbose: bool = False):
        """Initializes the text loader.

        Args:
            index: The index to provide text from. If a string, an AnseriniIndex object is created for the path.
            fields: The fields to load.
            verbose: Whether to display a progress bar when providing text.
        """
        self.index = index if isinstance(index, AnseriniIndex) else AnseriniIndex(index)
        self.fields = fields
        self.verbose = verbose

    __repr__ = pta.transformer_repr

    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Provides text from the index for each document in `inp`.

        Each value of the 'docno' column is looked up in the index and the
        configured fields are read from the returned document. If the lookup
        yields no document, every requested field is set to ``None`` for that
        row instead of raising, so one unknown docno does not abort the batch.

        Args:
            inp: A DataFrame with a 'docno' column containing document IDs.

        Returns:
            The frame produced by ``DataFrameBuilder.to_df(merge_on_index=inp)``,
            i.e. the loaded field values combined with ``inp``.
        """
        pta.validate.columns(inp, includes=['docno'])
        utils = pyterrier_anserini.J.IndexReaderUtils
        index_reader = self.index._searcher().object.reader
        # '_index' records each row's position in `inp`; it is what
        # to_df(merge_on_index=...) uses to line the loaded values up with `inp`.
        results = pta.DataFrameBuilder(['_index'] + self.fields)
        it = enumerate(inp['docno'])
        if self.verbose:
            it = pt.tqdm(it, unit='d', total=len(inp), desc='AnseriniTextLoader')
        for i, docno in it:
            doc = utils.document(index_reader, docno)
            if doc is None:
                # NOTE(review): Anserini's IndexReaderUtils.document is expected to
                # return null (surfaced here as None) when the docno is not in the
                # index -- confirm against the Anserini version in use.
                res = dict.fromkeys(self.fields)
            else:
                res = {f: doc.get(f) for f in self.fields}
            res['_index'] = i
            results.extend(res)
        return results.to_df(merge_on_index=inp)