Source code for pyterrier.terrier._text_loader
from typing import List, Union, Literal
import pandas as pd
import pyterrier as pt
from pyterrier.terrier._index import TerrierIndex
[docs]
class TerrierTextLoader(pt.Transformer):
    """A transformer that loads textual metadata from a Terrier index into a DataFrame by docid or docno."""

    schematic = {'label': 'TextLoader'}

    def __init__(
        self,
        index: TerrierIndex,
        fields: Union[List[str], str, Literal['*']] = '*',
        *,
        verbose=False
    ):
        """Initialise the transformer with the index to load metadata from.

        Args:
            index (TerrierIndex): The index to load metadata from. It must expose a
                metaindex via ``index.index_obj().getMetaIndex()``.
            fields: The fields to load from the index. If '*', all keys of the
                metaindex are loaded. A single field name may be given as a string;
                any other value is treated as a sequence of field names.
            verbose: Stored on the instance as ``verbose``; it is not otherwise
                used by the code in this class.

        Raises:
            ValueError: If the index has no metaindex, or if any requested field
                is not a key of the metaindex.
        """
        metaindex = index.index_obj().getMetaIndex()
        if metaindex is None:
            raise ValueError(f"Index {index} does not have a metaindex")

        available_fields = list(metaindex.getKeys())
        if isinstance(fields, str):
            # '*' selects every key; any other string is a single field name.
            fields = available_fields if fields == '*' else [fields]
        else:
            # Copy into a plain list: this decouples us from later mutation of the
            # caller's object and guarantees transform() hands pandas a list
            # (DataFrame.drop treats a tuple as ONE label, not a sequence of labels).
            fields = list(fields)

        missing_fields = set(fields) - set(available_fields)
        if missing_fields:
            raise ValueError(f"Index from {index} did not have requested metaindex keys {list(missing_fields)}. "
                             f"Keys present in metaindex are {available_fields}")

        self._index = index
        self.fields = fields
        self.verbose = verbose

    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Load metadata from the index into the input DataFrame.

        Args:
            inp: The input DataFrame. Must contain either 'docid' or 'docno'.
                When both are present, 'docid' is used.

        Returns:
            A new DataFrame with the metadata columns appended. The row index is
            reset to the default, and any input columns whose names match the
            loaded fields are replaced by the values from the index.

        Raises:
            ValueError: If the input has neither a 'docid' nor a 'docno' column.
        """
        if 'docno' not in inp.columns and 'docid' not in inp.columns:
            raise ValueError(f"Neither docid nor docno are in the input dataframe, found {list(inp.columns)}")

        metaindex = self._index.index_obj().getMetaIndex()

        # Get the docids
        if "docid" not in inp.columns:
            # Look up docids by docno, one metaindex lookup per row.
            # NOTE(review): an unknown docno presumably yields a sentinel docid
            # (e.g. negative) that is passed straight on to getItems -- confirm
            # how the metaindex handles that case.
            docids = inp.docno.map(lambda docno: metaindex.getDocument("docno", docno))
        else:
            # Use the provided docids
            docids = inp.docid

        # Look up the metadata and build a new frame to append
        docids = docids.values.tolist()  # getItems expects a list
        metadata_matrix = metaindex.getItems(self.fields, docids)  # indexed by docid then keys
        metadata_frame = pd.DataFrame(metadata_matrix, columns=self.fields)

        # Append the input and metadata frames
        inp = inp.drop(columns=self.fields, errors='ignore')  # make sure we don't end up with duplicates
        inp = inp.reset_index(drop=True)  # reset the index to default (matching metadata_frame)
        return pd.concat([inp, metadata_frame], axis='columns')

    def fuse_rank_cutoff(self, k):
        """Return a pipeline that applies ``pt.RankCutoff(k)`` and then this loader.

        Args:
            k: The cutoff passed to ``pt.RankCutoff``.

        Returns:
            The result of composing ``pt.RankCutoff(k)`` with this transformer
            using the ``>>`` operator.
        """
        # Cutting off first presumably avoids loading text for rows that would
        # be discarded anyway.
        return pt.RankCutoff(k) >> self
def terrier_text_loader(
    index,
    fields: Union[List[str], str, Literal['*']] = '*',
    *,
    verbose=False
) -> TerrierTextLoader:
    """Build a :class:`TerrierTextLoader` for the given index.

    The returned transformer loads textual metadata from a Terrier index into a
    DataFrame by docid or docno.

    Args:
        index (str or pyterrier.terrier.J.IndexRef or pyterrier.terrier.J.Index): The index to
            load metadata from. It is passed through ``TerrierIndex.coerce`` first.
        fields: The fields to load from the index. If '*', all fields will be loaded.
        verbose: Whether to print debug information.

    Returns:
        A ``TerrierTextLoader`` constructed from the coerced index, ``fields`` and ``verbose``.
    """
    # Coerce whatever index representation was supplied before building the loader.
    coerced_index = pt.terrier.TerrierIndex.coerce(index)
    loader = TerrierTextLoader(index=coerced_index, fields=fields, verbose=verbose)
    return loader