# Source code for pyterrier._evaluation._experiment

import os
import warnings
import pandas as pd
from typing import Union, Dict, Tuple, Sequence, Literal, Optional, overload, Any
from warnings import warn

from ._exec_linear import linear_execution
from ._exec_tree import tree_execution
from ._rendering import EvaluationDataTuple, RenderFromPerQuery
from ._validation import _validate
from . import SYSTEM_OR_RESULTS_TYPE, MEASURES_TYPE, TEST_FN_TYPE, SAVEFORMAT_TYPE, SAVEMODE_TYPE, VALIDATE_TYPE
import pyterrier as pt

# Typing overload (declaration only, no runtime behaviour):
# perquery is True or False and dataframe=True  ->  a single pandas DataFrame is returned.
@overload
def Experiment(
        retr_systems : Union[Sequence[SYSTEM_OR_RESULTS_TYPE], Dict[str, SYSTEM_OR_RESULTS_TYPE]],
        topics : pd.DataFrame,
        qrels : pd.DataFrame,
        eval_metrics : MEASURES_TYPE,
        names : Optional[Sequence[str]] = None,
        perquery : Union[Literal[False],Literal[True]] = False,
        dataframe : Literal[True] = True,
        batch_size : Optional[int] = None,
        filter_by_qrels : bool = False,
        filter_by_topics : bool = True,
        baseline : Optional[Union[int, str]] = None,
        test : Union[str,TEST_FN_TYPE] = "t",
        correction : Optional[str] = None,
        correction_alpha : float = 0.05,
        highlight : Optional[str] = None,
        round : Optional[Union[int,Dict[str,int]]] = None,
        verbose : Literal['auto', True, False] = 'auto',
        validate : VALIDATE_TYPE = 'warn',
        save_dir : Optional[str] = None,
        save_mode : SAVEMODE_TYPE = 'warn',
        save_format : SAVEFORMAT_TYPE = 'trec',
        precompute_prefix : bool = False,
        plan : Literal['linear', 'tree'] = 'linear',
        **kwargs) -> pd.DataFrame:
    ...

# Typing overload (declaration only, no runtime behaviour):
# perquery is True or False and dataframe=False  ->  a single dictionary keyed by str is returned.
@overload
def Experiment(
        retr_systems : Union[Sequence[SYSTEM_OR_RESULTS_TYPE], Dict[str, SYSTEM_OR_RESULTS_TYPE]],
        topics : pd.DataFrame,
        qrels : pd.DataFrame,
        eval_metrics : MEASURES_TYPE,
        names : Optional[Sequence[str]] = None,
        perquery : Union[Literal[False],Literal[True]] = False,
        dataframe : Literal[False] = False,
        batch_size : Optional[int] = None,
        filter_by_qrels : bool = False,
        filter_by_topics : bool = True,
        baseline : Optional[Union[int, str]] = None,
        test : Union[str,TEST_FN_TYPE] = "t",
        correction : Optional[str] = None,
        correction_alpha : float = 0.05,
        highlight : Optional[str] = None,
        round : Optional[Union[int,Dict[str,int]]] = None,
        verbose : Literal['auto', True, False] = 'auto',
        validate : VALIDATE_TYPE = 'warn',
        save_dir : Optional[str] = None,
        save_mode : SAVEMODE_TYPE = 'warn',
        save_format : SAVEFORMAT_TYPE = 'trec',
        precompute_prefix : bool = False,
        plan : Literal['linear', 'tree'] = 'linear',
        **kwargs) -> Dict[str,Any]:
    ...

# Typing overload (declaration only, no runtime behaviour):
# perquery='both' and dataframe=True  ->  a tuple of two pandas DataFrames is returned.
@overload
def Experiment(
        retr_systems : Union[Sequence[SYSTEM_OR_RESULTS_TYPE], Dict[str, SYSTEM_OR_RESULTS_TYPE]],
        topics : pd.DataFrame,
        qrels : pd.DataFrame,
        eval_metrics : MEASURES_TYPE,
        names : Optional[Sequence[str]] = None,
        perquery : Literal['both'] = 'both',
        dataframe : Literal[True] = True,
        batch_size : Optional[int] = None,
        filter_by_qrels : bool = False,
        filter_by_topics : bool = True,
        baseline : Optional[Union[int, str]] = None,
        test : Union[str,TEST_FN_TYPE] = "t",
        correction : Optional[str] = None,
        correction_alpha : float = 0.05,
        highlight : Optional[str] = None,
        round : Optional[Union[int,Dict[str,int]]] = None,
        verbose : Literal['auto', True, False] = 'auto',
        validate : VALIDATE_TYPE = 'warn',
        save_dir : Optional[str] = None,
        save_mode : SAVEMODE_TYPE = 'warn',
        save_format : SAVEFORMAT_TYPE = 'trec',
        precompute_prefix : bool = False,
        plan : Literal['linear', 'tree'] = 'linear',
        **kwargs) -> Tuple[pd.DataFrame,pd.DataFrame]:
    ...

# Typing overload (declaration only, no runtime behaviour):
# perquery='both' and dataframe=False  ->  a tuple of two dictionaries keyed by str is returned.
@overload
def Experiment(
        retr_systems : Union[Sequence[SYSTEM_OR_RESULTS_TYPE], Dict[str, SYSTEM_OR_RESULTS_TYPE]],
        topics : pd.DataFrame,
        qrels : pd.DataFrame,
        eval_metrics : MEASURES_TYPE,
        names : Optional[Sequence[str]] = None,
        perquery : Literal['both'] = 'both',
        dataframe : Literal[False] = False,
        batch_size : Optional[int] = None,
        filter_by_qrels : bool = False,
        filter_by_topics : bool = True,
        baseline : Optional[Union[int, str]] = None,
        test : Union[str,TEST_FN_TYPE] = "t",
        correction : Optional[str] = None,
        correction_alpha : float = 0.05,
        highlight : Optional[str] = None,
        round : Optional[Union[int,Dict[str,int]]] = None,
        verbose : Literal['auto', True, False] = 'auto',
        validate : VALIDATE_TYPE = 'warn',
        save_dir : Optional[str] = None,
        save_mode : SAVEMODE_TYPE = 'warn',
        save_format : SAVEFORMAT_TYPE = 'trec',
        precompute_prefix : bool = False,
        plan : Literal['linear', 'tree'] = 'linear',
        **kwargs) -> Tuple[Dict[str,Any], Dict[str,Any]]:
    ...


# [docs]
def Experiment(
        retr_systems : Union[Sequence[SYSTEM_OR_RESULTS_TYPE], Dict[str, SYSTEM_OR_RESULTS_TYPE]],
        topics : pd.DataFrame,
        qrels : pd.DataFrame,
        eval_metrics : MEASURES_TYPE,
        names : Optional[Sequence[str]] = None,
        perquery : Union[bool, Literal['both']] = False,
        dataframe : Union[Literal[False], Literal[True]] = True,
        batch_size : Optional[int] = None,
        filter_by_qrels : bool = False,
        filter_by_topics : bool = True,
        baseline : Optional[Union[int, str]] = None,
        test : Union[str,TEST_FN_TYPE] = "t",
        correction : Optional[str] = None,
        correction_alpha : float = 0.05,
        highlight : Optional[str] = None,
        round : Optional[Union[int,Dict[str,int]]] = None,
        verbose : Literal['auto', True, False] = 'auto',
        validate : VALIDATE_TYPE = 'warn',
        save_dir : Optional[str] = None,
        save_mode : SAVEMODE_TYPE = 'warn',
        save_format : SAVEFORMAT_TYPE = 'trec',
        precompute_prefix : bool = False,
        plan : Literal['linear', 'tree'] = 'linear',
        **kwargs):
    """
    Allows easy comparison of multiple retrieval transformer pipelines using a common set of topics, and
    identical evaluation measures computed using the same qrels. In essence, each transformer is applied on 
    the provided set of topics. Then the named evaluation measures are computed for each system.

    :param retr_systems: A list of transformers to evaluate. If you already have the results for one 
        (or more) of your systems, a results dataframe can also be used here. Results produced by 
        the transformers must have "qid", "docno", "score", "rank" columns. A dict can also be provided,
        in which case keys are used as system names and values are the systems/results.
    :param topics: Either a path to a topics file or a pandas.Dataframe with columns=['qid', 'query']
    :param qrels: Either a path to a qrels file or a pandas.Dataframe with columns=['qid','docno', 'label']   
    :param eval_metrics: Which evaluation metrics to use. E.g. ['map']. If "mrt" is present, it is removed from the 
        measures passed to the evaluation and is instead requested from the result renderer.
    :param names: List of names for each retrieval system when presenting the results.
        Default=None. If None: Obtains the `str()` representation of each transformer as its name.
        Ignored when ``retr_systems`` is a dict.
    :param batch_size: If not None, evaluation is conducted in batches of batch_size topics. Default=None, which evaluates all topics at once. 
        Applying a batch_size is useful if you have large numbers of topics, and/or if your pipeline requires large amounts of temporary memory
        during a run.
    :param filter_by_qrels: If True, will drop topics from the topics dataframe that have qids not appearing in the qrels dataframe. 
    :param filter_by_topics: If True, will drop topics from the qrels dataframe that have qids not appearing in the topics dataframe. 
    :param perquery: If True return each metric for each query, if False, will return mean metrics across all queries. If 'both', will return both averages and perquery results in a tuple. Default=False.
    :param save_dir: If set to the name of a directory, the results of each transformer will be saved in TREC-formatted results file, whose 
        filename is based on the systems names (as specified by ``names`` kwarg). If the file exists and ``save_mode`` is set to "reuse", then the file
        will be used for evaluation rather than the transformer. Default is None, such that saving and loading from files is disabled.
        In addition, two CSV summary files are written to ``save_dir`` on every call: ``aggregated.csv`` (one row per system, one column per measure)
        and ``perquery.csv`` (long-format table with columns ``name``, ``qid``, ``measure``, ``value``).
        If either CSV already exists, rows for systems not in the current experiment are preserved, allowing results to accumulate
        across multiple calls to ``pt.Experiment`` that each evaluate different subsets of systems.
        Not yet supported in combination with ``plan='tree'``.
    :param save_mode: Defines how existing files are used when ``save_dir`` is set. If set to "reuse", then files will be preferred
        over transformers for evaluation. If set to "overwrite", existing files will be replaced. If set to "warn" or "error", the presence of any 
        existing file will cause a warning or error, respectively. Default is "warn".
    :param save_format: How results are saved. Defaults to 'trec', which uses ``pt.io.read_results()`` and ``pt.io.write_results()`` for saving system outputs. 
        If TREC results format is insufficient, set ``save_format=pickle``. Alternatively, a tuple of read and write function can be specified, for instance, 
        ``save_format=(pandas.from_csv, pandas.DataFrame.to_csv)``, or even ``save_format=(pandas.from_parquet, pandas.DataFrame.to_parquet)``.
    :param dataframe: If True return results as a dataframe, else as a dictionary of dictionaries. Default=True.
    :param baseline: If set to the index of an item of the retr_system list, will calculate the number of queries 
        improved, degraded and the statistical significance (paired t-test p value) for each measure.
        When ``retr_systems`` is a dict, baseline can also be a system name (dict key).
        Default=None: If None, no additional columns will be added for each measure.
        Cannot be combined with ``perquery``.
    :param test: Which significance testing approach to apply. Defaults to "t". Alternatives are "wilcoxon" - not typically used for IR experiments. A Callable can also be passed - it should
        follow the specification of `scipy.stats.ttest_rel() <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html>`_, 
        i.e. it expects two arrays of numbers, and returns an array or tuple, of which the second value will be placed in the p-value column.
    :param correction: Whether any multiple testing correction should be applied. E.g. 'bonferroni', 'holm', 'hs' aka 'holm-sidak'. Default is None.
        Additional columns are added denoting whether the null hypothesis can be rejected, and the corrected p value. 
        See `statsmodels.stats.multitest.multipletests() <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html#statsmodels.stats.multitest.multipletests>`_
        for more information about available testing correction.
    :param correction_alpha: What alpha value for multiple testing correction. Default is 0.05.
    :param highlight: If `highlight="bold"`, highlights in bold the best measure value in each column; 
        if `highlight="color"` or `"colour"`, then the cell with the highest metric value will have a green background.
    :param round: How many decimal places to round each measure value to. This can also be a dictionary mapping measure name to number of decimal places.
        Default is None, which is no rounding.
    :param plan: Whether to execute the experiment using a 'linear' or 'tree' execution plan. The linear plan executes each system sequentially, 
        but does not allow for reuse of execution results between different systems. The tree plan identifies common prefixes between pipelines, 
        and executes each unique prefix only once, allowing for faster experiments. Default is 'linear'.
    :param verbose: If True, progress is shown as systems (or systems*batches if batch_size is set) are executed. Default is 'auto', which means
        no progress output for the linear plan; for ``plan='tree'``, the tree-execution plan is shown, rendered for a notebook when one is detected
        and for the terminal otherwise.
    :param validate: If set to value other than 'ignore', each transformer is validated against the topics dataframe, to ensure that it produces the expected output columns.
        ``pt.inspect.transformer_outputs()`` is used to determine the output columns. If 'warn', then transformers whose output columns don't match the columns required 
        by the specified evaluation measures will produce warnings; If 'error', then an error is produced. If a transformer cannot be inspected, a warning is produced.
    :param precompute_prefix:  Deprecated, use plan='tree' instead. If True, will precompute the common prefix of all retrieval systems, and use this to speed up evaluation. 
        Default is False. A tree execution plan is more efficient, as prefixes of different lengths can be reused, and is recommended for experiments with many systems.
        
    :return: A Dataframe/dict with each retrieval system with each metric evaluated, or alternatively a tuple with averages and perquery results. 

    :raises TypeError: if ``retr_systems`` is neither a list nor a dict, if ``baseline`` is a str while ``retr_systems`` is a list,
        or if unknown keyword arguments are passed.
    :raises ValueError: for an unknown baseline name, an invalid ``round`` type, ``correction`` without ``baseline``, no qid overlap
        between topics and qrels when filtering, a ``names`` length mismatch, or an unusable ``save_dir`` / system name.
    :raises AssertionError: for an out-of-range ``baseline``, ``baseline`` combined with ``perquery``, a negative rounding value,
        per-measure rounding combined with ``perquery``, or an unknown ``test`` name.
    """
    
    # a dict supplies both the names (keys) and the systems (values)
    if isinstance(retr_systems, dict):
        names = list(retr_systems.keys())
        retr_systems = list(retr_systems.values())
        if isinstance(baseline, str):
            if baseline not in names:
                raise ValueError(f"Unknown baseline '{baseline}'. Valid options are: {', '.join(names)}")
            # from here on, baseline is always a positional index
            baseline = names.index(baseline)
    elif not isinstance(retr_systems, list):
        raise TypeError("Expected list or dict of transformers for retr_systems, instead received %s" % str(type(retr_systems)))
    elif isinstance(baseline, str):
        raise TypeError("baseline should be an int when retr_systems is a list")
    
    if precompute_prefix:
        warn(
            "precompute_prefix is deprecated. Use plan='tree' instead", DeprecationWarning, stacklevel=2)

    # **kwargs only exists to report unexpected keyword arguments
    if kwargs:
        raise TypeError("Unknown kwargs: %s" % (str(list(kwargs.keys()))))

    if baseline is not None:
        assert 0 <= int(baseline) < len(retr_systems), \
            "baseline index %s is out of range for %d systems" % (str(baseline), len(retr_systems))
        assert not perquery, "baseline cannot be combined with perquery"

    # NOTE(review): a str that is not an existing file is passed through unchanged and
    # will presumably fail later when used as a dataframe - confirm whether it should raise here.
    if isinstance(topics, str):
        if os.path.isfile(topics):
            topics = pt.io.read_topics(topics)
    if isinstance(qrels, str):
        if os.path.isfile(qrels):
            qrels = pt.io.read_qrels(qrels)

    if round is not None:
        if isinstance(round, int):
            assert round >= 0, "round argument should be integer >= 0, not %s" % str(round)
        elif isinstance(round, dict):
            assert not perquery, "Sorry, per-measure rounding only support when reporting means" 
            for k,v in round.items():
                assert isinstance(v, int) and v >= 0, "rounding number for measure %s should be integer >= 0, not %s" % (k, str(v))
        else:
            raise ValueError("Argument round should be an integer or a dictionary")

    if correction is not None and baseline is None:
        raise ValueError("Requested multiple testing correction, but no baseline was specified.")

    # drop queries not appearing in the qrels
    if filter_by_qrels:
        # the commented variant would drop queries not having any RELEVANT labels
        # topics = topics.merge(qrels[qrels["label"] > 0][["qid"]].drop_duplicates())        
        topics = topics.merge(qrels[["qid"]].drop_duplicates())
        if len(topics) == 0:
            raise ValueError('There is no overlap between the qids found in the topics and qrels. If this is intentional, set filter_by_topics=False and filter_by_qrels=False.')

    # drop qrels not appearing in the topics
    if filter_by_topics:
        qrels = qrels.merge(topics[["qid"]].drop_duplicates())
        if len(qrels) == 0:
            raise ValueError('There is no overlap between the qids found in the topics and qrels. If this is intentional, set filter_by_topics=False and filter_by_qrels=False.')

    # resolve the significance test: a known name maps to scipy, anything else must be a callable
    from scipy import stats
    test_fn : TEST_FN_TYPE
    if test == "t":
        test_fn = stats.ttest_rel
    elif test == "wilcoxon":
        test_fn = stats.wilcoxon
    else:
        assert not isinstance(test, str), "Unknown test function name %s" % test
        test_fn = test
    
    # obtain system names if not specified
    if names is None:
        names = [str(system) for system in retr_systems]
    elif len(names) != len(retr_systems):
        raise ValueError("names should be the same length as retr_systems")

    # validate save_dir and resulting filenames
    if save_dir is not None:
        if plan == 'tree':
            raise ValueError("save_dir is not yet supported for tree execution plan")

        if not os.path.exists(save_dir):
            raise ValueError("save_dir %s does not exist" % save_dir)
        if not os.path.isdir(save_dir):
            raise ValueError("save_dir %s is not a directory" % save_dir)
        from ..io import ok_filename
        for n in names:
            if not ok_filename(n):
                raise ValueError("Name contains bad characters and save_dir is set, name is %s" % n)
        if len(set(names)) < len(names):
            raise ValueError("save_dir is set, but names are not unique. Use names= to set unique names")

    # "mrt" is not passed on as a measure; it is requested from the renderer via mrt_needed instead
    mrt_needed = False
    if "mrt" in eval_metrics:
        mrt_needed = True
        eval_metrics = list(eval_metrics)  # new list, so the caller's measures are not mutated
        eval_metrics.remove("mrt")

    # validate the transformers produce the expected columns
    _validate(retr_systems, topics, eval_metrics, names, validate)
    
    renderer = RenderFromPerQuery(names, 
                                  baseline=baseline, 
                                  test_fn=test_fn, 
                                  correction=correction, 
                                  correction_alpha=correction_alpha, 
                                  round=round, 
                                  precompute_time=0)
    
    def _is_notebook() -> bool:
        import sys 
        return 'google.colab' in sys.modules or pt.utils._get_notebook() is not None
    
    if plan == 'tree':
        # save_dir is necessarily None here: the combination with plan='tree' was rejected above

        # for tree execution, default verbose to 'notebook' when in a notebook, else 'terminal'
        tverbose : Literal['terminal', 'notebook', False]
        if verbose == 'auto' or verbose is True:
            tverbose = 'notebook' if _is_notebook() else 'terminal'
        else:
            assert verbose is False
            tverbose = verbose
        
        tree_execution(renderer, retr_systems, topics, qrels, eval_metrics, names, tverbose, save_dir, save_mode, save_format, batch_size, perquery is not False)
    else:
        # default verbose to False for linear execution
        lverbose : bool
        if verbose == 'auto':
            lverbose = False
        else:
            lverbose = verbose
        linear_execution(renderer, retr_systems, topics, qrels, eval_metrics, names, precompute_prefix, lverbose, save_dir, save_mode, save_format, batch_size, perquery is not False)

    if save_dir is not None:
        # always save aggregated and per-query results as CSV files regardless of perquery setting
        _update_saved_summaries(renderer, save_dir, names, mrt_needed)

    if not perquery:
        return renderer.averages(dataframe=dataframe, highlight=highlight, mrt_needed=mrt_needed)
    
    perquery_results = renderer.perquery(dataframe=dataframe)
    if perquery == 'both':
        # pass mrt_needed here as well, so that a requested "mrt" is not dropped from the
        # averages when both averages and per-query results are returned
        average_results = renderer.averages(dataframe=dataframe, highlight=highlight, mrt_needed=mrt_needed)

        if dataframe:
            from typing import cast as tcast
            average_results_df = tcast(pd.DataFrame, average_results)
            perquery_results_df = tcast(pd.DataFrame, perquery_results)

            # NOTE(review): the Styler objects produced here are not kept, so these captions
            # appear to be discarded - confirm whether they are still needed.
            average_results_df.style.set_caption("Averages")
            perquery_results_df.style.set_caption("Per Query")
            return EvaluationDataTuple(average_results, perquery_results)
        return (average_results, perquery_results)
    return perquery_results


def _update_saved_summaries(renderer, save_dir : str, names : Sequence[str], mrt_needed : bool) -> None:
    """
    Writes ``aggregated.csv`` and ``perquery.csv`` into ``save_dir`` for the systems named in ``names``.

    Rows of an existing CSV whose ``name`` is not among ``names`` are kept; rows for the current systems are
    replaced by the freshly rendered results.

    :param renderer: the renderer holding the results of the current experiment
    :param save_dir: existing directory in which the two CSV files are written
    :param names: names of the systems evaluated in the current experiment
    :param mrt_needed: forwarded to ``renderer.averages()``
    """
    current_names_set = set(names)
    aggregated_path = os.path.join(save_dir, "aggregated.csv")
    perquery_path = os.path.join(save_dir, "perquery.csv")

    new_agg = renderer.averages(dataframe=True, mrt_needed=mrt_needed)
    new_pq = renderer.perquery(dataframe=True)

    # preserve rows for runs that exist in save_dir but are not part of the current experiment
    if os.path.exists(aggregated_path):
        old_agg = pd.read_csv(aggregated_path)
        old_cols = set(old_agg.columns)
        new_cols = set(new_agg.columns)
        if old_cols != new_cols:
            # stacklevel=3 attributes the warning to the caller of Experiment(), not to this helper
            warnings.warn(
                f"Evaluation measures differ between the existing '{aggregated_path}' "
                f"(columns: {', '.join(sorted(old_cols))}) and the current run "
                f"(columns: {', '.join(sorted(new_cols))}). "
                "Missing values will be filled with NaN.",
                stacklevel=3,
            )
        old_agg = old_agg[~old_agg["name"].isin(current_names_set)]
        if not old_agg.empty:
            new_agg = pd.concat([new_agg, old_agg], ignore_index=True)

    if os.path.exists(perquery_path):
        old_pq = pd.read_csv(perquery_path)
        old_pq = old_pq[~old_pq["name"].isin(current_names_set)]
        if not old_pq.empty:
            new_pq = pd.concat([new_pq, old_pq], ignore_index=True)

    new_agg.to_csv(aggregated_path, index=False)
    new_pq.to_csv(perquery_path, index=False)