Source code for pyterrier_dr.prf

import numpy as np
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta


class VectorPrf(pt.Transformer):
    """
    Performs a Rocchio-esque PRF by linearly combining the query_vec column with
    the doc_vec column of the top k documents.

    The rewritten query vector is ``alpha * query_vec + beta * mean(doc_vecs)``,
    where ``doc_vecs`` are the vectors of the k feedback documents.

    Arguments:
     - alpha: weight of original query_vec
     - beta: weight of doc_vec
     - k: number of pseudo-relevant feedback documents

    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``. If a
    ``rank`` column (or, failing that, a ``score`` column) is present it is used
    to select the top k documents; otherwise the first k rows are used.

    Output Columns: ``['qid', 'query_vec']`` (Any other input columns whose name
    starts with ``q`` are also included in the output.)

    Example::

            prf_pipe = model >> index >> index.vec_loader() >> VectorPrf() >> index

    .. cite.dblp:: journals/tois/0009MZKZ23
    """

    def __init__(self, *, alpha: float = 1, beta: float = 0.2, k: int = 3) -> None:
        self.alpha = alpha
        self.beta = beta
        self.k = k

    def compile(self) -> pt.Transformer:
        """Return a pipeline that applies a rank cutoff of ``k`` before this transformer."""
        return pt.RankCutoff(self.k) >> self

    def _feedback_docs(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Return the (at most) k best-ranked rows of ``inp``.

        Rows are ordered by ascending ``rank`` when that column exists, otherwise
        by descending ``score``; with neither column the incoming row order is
        kept. This keeps the selection consistent with the rank-based cutoff used
        by :meth:`compile` even when the input frame is not pre-sorted.
        """
        # mergesort is stable, so ties keep their original relative order and an
        # already-sorted frame yields exactly the same rows as a plain head(k).
        if 'rank' in inp.columns:
            inp = inp.sort_values('rank', kind='mergesort')
        elif 'score' in inp.columns:
            inp = inp.sort_values('score', ascending=False, kind='mergesort')
        return inp.head(self.k)

    @pta.transform.by_query(add_ranks=False)
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Performs Vector PRF on the input dataframe.

        Arguments:
         - inp: result frame with ``query_vec`` and ``doc_vec`` columns
           (presumably the rows of a single query, as supplied by the
           ``by_query`` decorator)

        Returns: a one-row dataframe holding the query columns of the first input
        row and the new ``query_vec`` (or an empty dataframe with those columns
        when ``inp`` has no rows).
        """
        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
        out_cols = query_cols + ['query_vec']

        # Nothing to expand: np.stack would fail on an empty list, so return an
        # empty frame that still advertises the output columns.
        if inp.empty:
            return pd.DataFrame(columns=out_cols)

        # get the doc vectors for the top k docs
        doc_vecs = np.stack(self._feedback_docs(inp)['doc_vec'].to_list())

        # weight the original query vector and add the weighted mean of the doc vectors
        query_vec = self.alpha * inp['query_vec'].iloc[0] + self.beta * np.mean(doc_vecs, axis=0)

        # generate new query dataframe with the existing query columns and the new query_vec
        first_row = [inp[col].iloc[0] for col in query_cols]
        return pd.DataFrame([first_row + [query_vec]], columns=out_cols)

    def __repr__(self) -> str:
        return f"VectorPrf(alpha={self.alpha}, beta={self.beta}, k={self.k})"
class AveragePrf(pt.Transformer):
    """
    Performs Average PRF (as described by Li et al.) by averaging the query_vec column with
    the doc_vec column of the top k documents.

    The rewritten query vector is the element-wise mean of the original
    ``query_vec`` and the vectors of the k feedback documents (each weighted
    equally).

    Arguments:
     - k: number of pseudo-relevant feedback documents

    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``. If a
    ``rank`` column (or, failing that, a ``score`` column) is present it is used
    to select the top k documents; otherwise the first k rows are used.

    Output Columns: ``['qid', 'query_vec']`` (Any other input columns whose name
    starts with ``q`` are also included in the output.)

    Example::

            prf_pipe = model >> index >> index.vec_loader() >> AveragePrf() >> index

    .. cite.dblp:: journals/tois/0009MZKZ23
    """

    def __init__(self, *, k: int = 3) -> None:
        self.k = k

    def compile(self) -> pt.Transformer:
        """Return a pipeline that applies a rank cutoff of ``k`` before this transformer."""
        return pt.RankCutoff(self.k) >> self

    def _feedback_docs(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Return the (at most) k best-ranked rows of ``inp``.

        Rows are ordered by ascending ``rank`` when that column exists, otherwise
        by descending ``score``; with neither column the incoming row order is
        kept. This keeps the selection consistent with the rank-based cutoff used
        by :meth:`compile` even when the input frame is not pre-sorted.
        """
        # mergesort is stable, so ties keep their original relative order and an
        # already-sorted frame yields exactly the same rows as a plain head(k).
        if 'rank' in inp.columns:
            inp = inp.sort_values('rank', kind='mergesort')
        elif 'score' in inp.columns:
            inp = inp.sort_values('score', ascending=False, kind='mergesort')
        return inp.head(self.k)

    @pta.transform.by_query(add_ranks=False)
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Performs Average PRF on the input dataframe.

        Arguments:
         - inp: result frame with ``query_vec`` and ``doc_vec`` columns
           (presumably the rows of a single query, as supplied by the
           ``by_query`` decorator)

        Returns: a one-row dataframe holding the query columns of the first input
        row and the new ``query_vec`` (or an empty dataframe with those columns
        when ``inp`` has no rows).
        """
        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
        out_cols = query_cols + ['query_vec']

        # Nothing to average: .iloc[0] would fail on an empty frame, so return an
        # empty frame that still advertises the output columns.
        if inp.empty:
            return pd.DataFrame(columns=out_cols)

        # stack the original query vector together with the top k doc vectors
        doc_vecs = self._feedback_docs(inp)['doc_vec'].to_list()
        all_vecs = np.stack([inp['query_vec'].iloc[0], *doc_vecs])

        # the new query vector is the plain mean over the query and doc vectors
        query_vec = np.mean(all_vecs, axis=0)

        # generate new query dataframe with the existing query columns and the new query_vec
        first_row = [inp[col].iloc[0] for col in query_cols]
        return pd.DataFrame([first_row + [query_vec]], columns=out_cols)

    def __repr__(self) -> str:
        return f"AveragePrf(k={self.k})"