Source code for pyterrier_dr._mmr

import numpy as np
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta


[docs] class MmrScorer(pt.Transformer): """An MMR (Maximal Marginal Relevance) scorer (i.e., re-ranker). The MMR scorer re-orders documents by balancing relevance (from the initial scores) and diversity (based on the similarity of the document vectors). .. cite.dblp:: conf/sigir/CarbonellG98 """ def __init__(self, *, Lambda: float = 0.5, norm_rel: bool = False, norm_sim: bool = False, drop_doc_vec: bool = True, verbose: bool = False): """ Args: Lambda: The balance parameter between relevance and diversity (default: 0.5) norm_rel: Whether to normalize relevance scores to [0, 1] (default: False) norm_sim: Whether to normalize similarity scores to [0, 1] (default: False) drop_doc_vec: Whether to drop the 'doc_vec' column after re-ranking (default: True) verbose: Whether to display verbose output (e.g., progress bars) (default: False) """ self.Lambda = Lambda self.norm_rel = norm_rel self.norm_sim = norm_sim self.drop_doc_vec = drop_doc_vec self.verbose = verbose def transform(self, inp: pd.DataFrame) -> pd.DataFrame: pta.validate.result_frame(inp, extra_columns=['doc_vec']) out = [] it = inp.groupby('qid') if self.verbose: it = pt.tqdm(it, unit='q', desc=repr(self)) for qid, frame in it: scores = frame['score'].values dvec_matrix = np.stack(frame['doc_vec']) dvec_matrix = dvec_matrix / np.linalg.norm(dvec_matrix, axis=1)[:, None] dvec_sims = dvec_matrix @ dvec_matrix.T if self.norm_rel: scores = (scores - scores.min()) / (scores.max() - scores.min()) if self.norm_sim: dvec_sims = (dvec_sims - dvec_sims.min()) / (dvec_sims.max() - dvec_sims.min()) marg_rels = np.zeros_like(scores) new_idxs = [] for _ in range(scores.shape[0]): mmr_scores = (self.Lambda * scores) - ((1 - self.Lambda) * marg_rels) idx = mmr_scores.argmax() new_idxs.append(idx) if marg_rels.shape[0] > 1: marg_rels = np.max(np.stack([marg_rels, dvec_sims[idx]]), axis=0) marg_rels[idx] = float('inf') # ignore this document from now on new_frame = frame.iloc[new_idxs].reset_index(drop=True).assign( score=-np.arange(len(new_idxs), dtype=float), rank=np.arange(len(new_idxs)) ) if self.drop_doc_vec: new_frame = new_frame.drop(columns='doc_vec') out.append(new_frame) return pd.concat(out, ignore_index=True) __repr__ = pta.transformer_repr