Source code for pyterrier_dr._ils
from typing import Optional, Iterable, Tuple
import numpy as np
import pandas as pd
import ir_measures
import pyterrier as pt
from pyterrier_dr import FlexIndex
[docs]
def ILS(index: FlexIndex, *, name: Optional[str] = None, verbose: bool = False) -> ir_measures.Measure: # noqa: N802
    """Build an ILS (Intra-List Similarity) measure backed by the vectors stored in ``index``.

    Higher scores indicate lower diversity in the results.

    This measure supports the ``@k`` convention for applying a top-k cutoff before scoring.

    Args:
        index (FlexIndex): The index to use for loading document vectors.
        name (str, optional): The name of the measure (default: "ILS").
        verbose (bool, optional): Whether to display a progress bar.

    Returns:
        ir_measures.Measure: An ILS measure object.

    .. cite.dblp:: conf/www/ZieglerMKL05
    """
    measure_name = name or 'ILS'

    def _score(qrels, results):
        # qrels are accepted to satisfy the callback signature but are not used:
        # the score is computed from the retrieved documents' vectors only.
        return _ils(results, index, verbose=verbose)

    return ir_measures.define(_score, name=measure_name)
[docs]
def ils(results: pd.DataFrame, index: Optional[FlexIndex] = None, *, verbose: bool = False) -> Iterable[Tuple[str, float]]:
    """Compute the ILS (Intra-List Similarity) for each query in a result frame.

    Higher scores indicate lower diversity in the results.

    Args:
        results: The result frame to calculate ILS for.
        index: The index to use for loading document vectors. Required if `results` does not have a `doc_vec` column.
        verbose: Whether to display a progress bar.

    Returns:
        Iterable[Tuple[str,float]]: An iterable of (qid, ILS) pairs.

    .. cite.dblp:: conf/www/ZieglerMKL05
    """
    # The shared implementation works on `query_id`/`doc_id` column names,
    # so translate from the `qid`/`docno` names before delegating.
    column_map = {'docno': 'doc_id', 'qid': 'query_id'}
    renamed = results.rename(columns=column_map)
    return _ils(renamed, index, verbose=verbose)
def _ils(results: pd.DataFrame, index: Optional[FlexIndex] = None, *, verbose: bool = False) -> Iterable[Tuple[str, float]]:
    """Compute the mean pairwise cosine similarity of each query's document vectors.

    Args:
        results: Frame with a ``query_id`` column and either a ``doc_vec`` column or,
            when ``index`` is given, a ``doc_id`` column used to look the vectors up.
        index: If provided, ``index.vec_loader()`` is applied to ``results`` (with
            ``doc_id`` renamed to ``docno``) to attach the vectors.
        verbose: Whether to wrap the per-query iteration in a progress bar.

    Returns:
        A view of ``(query_id, score)`` pairs, one per query present in ``results``.
        Queries with fewer than two documents score ``0.0``.

    Raises:
        ValueError: If ``results`` has no ``doc_vec`` column after the optional vector loading.
    """
    if index is not None:
        results = index.vec_loader()(results.rename(columns={'doc_id': 'docno'}))
    if 'doc_vec' not in results:
        raise ValueError('You must provide index to ils() if results do not have a `doc_vec` column.')

    groups = results.groupby('query_id')
    if verbose:
        groups = pt.tqdm(groups, unit='q', desc='ILS')

    res = {}
    for qid, frame in groups:
        if len(frame) < 2:
            res[qid] = 0.0  # ILS is ill-defined when there's only one item.
            continue
        vec_matrix = np.stack(list(frame['doc_vec']))
        norms = np.linalg.norm(vec_matrix, axis=1, keepdims=True)
        # An all-zero vector has norm 0; dividing by it would give NaN and poison the
        # whole query's score. Dividing by 1 instead keeps the vector at zero, so it
        # simply contributes a similarity of 0 to every pair it takes part in.
        safe_norms = np.where(norms == 0, 1.0, norms)
        unit_vecs = vec_matrix / safe_norms
        vec_sims = unit_vecs @ unit_vecs.T
        # Only the strict upper triangle: each unordered pair once, no self-similarity.
        upper_right = np.triu_indices(vec_sims.shape[0], k=1)
        res[qid] = np.mean(vec_sims[upper_right])
    return res.items()