# Source code for pyterrier._evaluation

import pandas as pd
from ir_measures import Measure
from typing import Literal, Union, Sequence, Callable, Tuple, IO, Dict, Any
from .. import Transformer
import types

# A single evaluation measure: either its string form or an ir_measures ``Measure`` object.
MEASURE_TYPE = Union[str, Measure]
# An ordered collection of measures.
MEASURES_TYPE = Sequence[MEASURE_TYPE]
# The permitted string values for a save-mode option.
SAVEMODE_TYPE = Literal['reuse', 'overwrite', 'error', 'warn']
# The permitted string values for a validation option.
VALIDATE_TYPE = Literal['warn', 'error', 'ignore']

# Something that can be evaluated: a Transformer, or a dataframe.
SYSTEM_OR_RESULTS_TYPE = Union[Transformer, pd.DataFrame]
# A save format: the literal 'trec', a module, or a pair of callables --
# the first takes an IO object and returns a dataframe, the second takes a
# dataframe and an IO object and returns None
# (presumably a (reader, writer) pair -- TODO confirm against callers).
SAVEFORMAT_TYPE = Union[
    Literal['trec'],
    types.ModuleType,
    Tuple[Callable[[IO], pd.DataFrame], Callable[[pd.DataFrame, IO], None]],
]


# Any Python numeric scalar.
NUMERIC_TYPE = Union[float, int, complex]
# A callable taking two numeric sequences and returning a 2-tuple whose second
# element is numeric (presumably a significance test yielding
# (statistic, p-value) -- TODO confirm).
TEST_FN_TYPE = Callable[
    [Sequence[NUMERIC_TYPE], Sequence[NUMERIC_TYPE]],
    Tuple[Any, NUMERIC_TYPE],
]

# we need types before imports relying on those types
from ._experiment import Experiment #noqa: E402
from ._grid import GridScan, GridSearch, KFoldGridSearch #noqa: E402


# [docs]
def Evaluate(res : pd.DataFrame, qrels : pd.DataFrame, metrics : MEASURES_TYPE= ['map', 'ndcg'], perquery : bool = False) -> Dict:
    """
    Compute evaluation measures for one set of already-obtained results.

    Use this instead of ``pt.Experiment()`` when the results already exist and only the
    measurements are wanted. ``pt.Experiment()`` remains the usual PyTerrier route for
    evaluating transformers; this function avoids having to build a transformer pipeline
    just to score existing results.

    :param res: The results to evaluate. Either a dataframe with columns=['qid', 'docno', 'score']
        or a dict {qid:{docno:score,},}
    :param qrels: The relevance assessments. Either a dataframe with columns=['qid','docno', 'label']
        or a dict {qid:{docno:label,},}
    :param metrics: The evaluation measures to compute. Default=['map', 'ndcg']
    :param perquery: If true, report each measure for each query; otherwise report mean values.
        Default=False
    :return: The measurements, i.e. the second element of the pair returned by the
        internal ``_run_and_evaluate()`` helper.
    :raises ValueError: If ``res`` contains no entries.
    """
    # len() is used rather than truthiness because a dataframe has no unambiguous truth value.
    if not len(res):
        raise ValueError("No results for evaluation")

    # Imported at call time, inside the function, rather than at module import.
    from ._execution import _run_and_evaluate

    # The second positional argument is None -- presumably the topics slot, which is
    # not needed when results are supplied directly (TODO confirm against _execution).
    outcome = _run_and_evaluate(res, None, qrels, metrics, perquery=perquery)
    # Only the second element of the returned pair is handed back to the caller.
    _unused, measurements = outcome
    return measurements


# Names exported by ``from <this module> import *``.
__all__ = [
    "Experiment",
    "Evaluate",
    "GridScan",
    "GridSearch",
    "KFoldGridSearch",
]