Source code for pyterrier_dr.prf

import numpy as np
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta



[docs]
class VectorPrf(pt.Transformer):
    """
    Performs a Rocchio-esque PRF by linearly combining the query_vec column with
    the doc_vec column of the top k documents.

    The new query vector is ``alpha * query_vec + beta * mean(doc_vec)``, where the
    mean is taken over the first k rows of each query's results.

    Arguments:
     - alpha: weight of original query_vec
     - beta: weight of doc_vec
     - k: number of pseudo-relevant feedback documents

    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``

    Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input, i.e. columns
    whose name starts with ``q``, are also included in the output.)

    Example::

            prf_pipe = model >> index >> index.vec_loader() >> VectorPrf() >> index

    .. cite.dblp:: journals/tois/0009MZKZ23
    """
    def __init__(
        self,
        *,
        alpha: float = 1,
        beta: float = 0.2,
        k: int = 3,
    ) -> None:
        self.alpha = alpha
        self.beta = beta
        self.k = k

    def compile(self) -> pt.Transformer:
        """Returns ``pt.RankCutoff(self.k) >> self``, i.e. a rank cutoff at k placed in front of this transformer."""
        return pt.RankCutoff(self.k) >> self

    @pta.transform.by_query(add_ranks=False)
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Performs Vector PRF on the input dataframe.

        Arguments:
         - inp: results with ``query_vec`` and ``doc_vec`` columns. The first k rows (in the
           order given) are used as feedback documents; rows are not re-sorted here.

        Returns: a one-row dataframe holding the query columns (taken from the first input row)
        and the re-weighted ``query_vec``. An empty input yields an empty dataframe with the
        same columns. If no feedback rows are selected (e.g. ``k=0``), the feedback term is
        omitted and ``query_vec`` is only scaled by ``alpha``.
        """
        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])

        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
        out_cols = query_cols + ['query_vec']

        if len(inp) == 0:
            # No rows: ``.iloc[0]`` and ``np.stack([])`` below would both raise, so return
            # a frame with the output columns and no rows instead.
            return inp[out_cols].head(0)

        # start from the weighted original query vector
        query_vec = self.alpha * inp['query_vec'].iloc[0]

        # get the doc vectors for the top k docs (read the column directly rather than
        # building a namedtuple of every column per row)
        feedback_vecs = inp['doc_vec'].head(self.k).tolist()
        if feedback_vecs:
            # add the weighted average of the feedback vectors to the query
            query_vec = query_vec + self.beta * np.mean(np.stack(feedback_vecs), axis=0)

        # generate new query dataframe with the existing query columns and the new query_vec
        first_row = [inp[col].iloc[0] for col in query_cols]
        return pd.DataFrame([first_row + [query_vec]], columns=out_cols)

    def __repr__(self) -> str:
        return f"VectorPrf(alpha={self.alpha}, beta={self.beta}, k={self.k})"




[docs]
class AveragePrf(pt.Transformer):
    """
    Performs Average PRF (as described by Li et al.) by averaging the query_vec column with
    the doc_vec column of the top k documents.

    The new query vector is the unweighted mean of the original ``query_vec`` and the
    ``doc_vec`` values of the first k rows of each query's results.

    Arguments:
     - k: number of pseudo-relevant feedback documents

    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``

    Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input, i.e. columns
    whose name starts with ``q``, are also included in the output.)

    Example::

            prf_pipe = model >> index >> index.vec_loader() >> AveragePrf() >> index

    .. cite.dblp:: journals/tois/0009MZKZ23
    """
    def __init__(
        self,
        *,
        k: int = 3,
    ) -> None:
        self.k = k

    def compile(self) -> pt.Transformer:
        """Returns ``pt.RankCutoff(self.k) >> self``, i.e. a rank cutoff at k placed in front of this transformer."""
        return pt.RankCutoff(self.k) >> self

    @pta.transform.by_query(add_ranks=False)
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Performs Average PRF on the input dataframe.

        Arguments:
         - inp: results with ``query_vec`` and ``doc_vec`` columns. The first k rows (in the
           order given) are used as feedback documents; rows are not re-sorted here.

        Returns: a one-row dataframe holding the query columns (taken from the first input row)
        and the averaged ``query_vec``. An empty input yields an empty dataframe with the
        same columns.
        """
        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])

        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
        out_cols = query_cols + ['query_vec']

        if len(inp) == 0:
            # No rows: ``.iloc[0]`` below would raise, so return a frame with the output
            # columns and no rows instead.
            return inp[out_cols].head(0)

        # collect the query_vec together with the doc vectors for the top k docs (read the
        # column directly rather than building a namedtuple of every column per row)
        all_vecs = [inp['query_vec'].iloc[0]] + inp['doc_vec'].head(self.k).tolist()
        # the new query vector is the plain mean of the query and document vectors
        query_vec = np.mean(np.stack(all_vecs), axis=0)

        # generate new query dataframe with the existing query columns and the new query_vec
        first_row = [inp[col].iloc[0] for col in query_cols]
        return pd.DataFrame([first_row + [query_vec]], columns=out_cols)

    def __repr__(self) -> str:
        return f"AveragePrf(k={self.k})"