from . import Transformer, Estimator
from .apply import doc_score, doc_features
from .model import add_ranks
from typing import Sequence, Union, Tuple
import numpy as np, pandas as pd
# Type alias: either a single feature index, or a sequence of feature indices.
FeatureList = Union[Sequence[int], int]
class AblateFeatures(Transformer):
    """
    Transformer that zeroes out the specified feature indices in each row's
    "features" array, leaving all other feature values untouched. Useful for
    feature ablation studies.
    """

    def __init__(self, fids: FeatureList):
        """
        Args:
            fids: a single feature index, or a list/tuple of feature indices, to ablate
        """
        # Normalise to a list of indices. The declared type is Sequence[int] or int,
        # so accept tuples as well as lists; anything else is treated as a single index.
        self.fids = list(fids) if isinstance(fids, (list, tuple)) else [fids]
        # value that ablated features are overwritten with
        self.null = 0

    def transform(self, topics_and_res):
        """
        Returns a copy of the input dataframe in which the configured feature
        indices have been set to 0 in every row's "features" array.
        """
        assert "features" in topics_and_res.columns

        def _reset(row):
            # copy first, so the caller's arrays are never mutated in place
            fvalues = row["features"].copy()
            for findex in self.fids:
                fvalues[findex] = self.null
            return fvalues

        topics_and_res = topics_and_res.copy()
        topics_and_res["features"] = topics_and_res.apply(_reset, axis=1)
        return topics_and_res
class KeepFeatures(Transformer):
    """
    Transformer that reduces each row's "features" array to only the specified
    feature indices. Useful for feature ablation studies where only some
    features should be retained.
    """

    def __init__(self, fids: FeatureList):
        """
        Args:
            fids: a single feature index, or a list/tuple of feature indices, to keep
        """
        # Normalise to a list of indices. The declared type is Sequence[int] or int,
        # so accept tuples as well as lists; anything else is treated as a single index.
        self.fids = list(fids) if isinstance(fids, (list, tuple)) else [fids]
        self.null = 0

    def transform(self, topics_and_res):
        """
        Returns a copy of the input dataframe in which each row's "features"
        array contains only the configured feature indices (via fancy indexing).
        """
        assert "features" in topics_and_res.columns
        topics_and_res = topics_and_res.copy()
        topics_and_res["features"] = topics_and_res.apply(lambda row: row["features"][self.fids], axis=1)
        return topics_and_res
class RegressionTransformer(Estimator):
    """
    This class simplifies the use of Scikit-learn's techniques for learning-to-rank.
    """
    def __init__(self, learner, *args, fit_kwargs=None, **kwargs):
        """
        Init method

        Args:
            learner: The model to use for learning-to-rank. Must have fit() and predict() methods.
            fit_kwargs: A dictionary containing additional arguments that are passed to learner's fit() method.
        """
        # Use None as the default rather than a mutable {} default, which would be
        # shared across every instance of this class.
        self.fit_kwargs = {} if fit_kwargs is None else fit_kwargs
        super().__init__(*args, **kwargs)
        self.learner = learner
        # number of features observed during fit(); None until fit() has been called
        self.num_f = None

    def fit(self, topics_and_results_Train, qrelsTrain, topics_and_results_Valid=None, qrelsValid=None):
        """
        Trains the model with the given topics.

        Args:
            topics_and_results_Train(DataFrame): A dataframe with the topics and results to train the model
            qrelsTrain(DataFrame): A dataframe containing the qrels for the training topics
        """
        if len(topics_and_results_Train) == 0:
            raise ValueError("No topics to fit to")
        if 'features' not in topics_and_results_Train.columns:
            raise ValueError("No features column retrieved")
        # left-join the qrels onto the results; unjudged documents get label 0
        train_DF = topics_and_results_Train.merge(qrelsTrain, on=['qid', 'docno'], how='left').fillna(0)
        kwargs = self.fit_kwargs
        self.learner.fit(np.stack(train_DF["features"].values), train_DF["label"].values, **kwargs)
        # remember the feature dimensionality so transform() can validate its input
        self.num_f = train_DF.iloc[0].features.shape[0]
        return self

    def transform(self, test_DF):
        """
        Predicts the scores for the given topics.

        Args:
            test_DF(DataFrame): A dataframe with the documents to score; must have a "features" column.
        """
        if len(test_DF) == 0:
            raise ValueError("No results to transform")
        test_DF = test_DF.copy()
        # check for change in number of features w.r.t. what the model was fitted on
        found_numf = test_DF.iloc[0].features.shape[0]
        if self.num_f is not None:
            if found_numf != self.num_f:
                raise ValueError("Expected %d features, but found %d features" % (self.num_f, found_numf))
        # sklearn tree-based learners expose feature_importances_; use it as a
        # secondary consistency check on the feature dimensionality
        if hasattr(self.learner, 'feature_importances_'):
            if len(self.learner.feature_importances_) != found_numf:
                raise ValueError("Expected %d features, but found %d features" % (len(self.learner.feature_importances_), found_numf))
        test_DF["score"] = self.learner.predict(np.stack(test_DF["features"].values))
        return add_ranks(test_DF)
class LTRTransformer(RegressionTransformer):
    """
    This class simplifies the use of LightGBM and XGBoost for learning-to-rank.
    """
    def fit(self, topics_and_results_Train, qrelsTrain, topics_and_results_Valid, qrelsValid):
        """
        Trains the model with the given training and validation topics.

        Args:
            topics_and_results_Train(DataFrame): A dataframe with the topics and results to train the model
            qrelsTrain(DataFrame): A dataframe containing the qrels for the training topics
            topics_and_results_Valid(DataFrame): A dataframe with the topics and results for validation
            qrelsValid(DataFrame): A dataframe containing the qrels for the validation topics
        """
        def _count_by_topic(res: pd.DataFrame) -> Tuple[Sequence[int], pd.DataFrame]:
            # we must ensure res and count_series have the same ordering
            res = res.sort_values("qid")
            count_series = res.groupby(["qid"], sort=False)["docno"].count().to_numpy()
            return count_series, res

        if topics_and_results_Train is None or len(topics_and_results_Train) == 0:
            raise ValueError("No training results to fit to")
        if topics_and_results_Valid is None or len(topics_and_results_Valid) == 0:
            raise ValueError("No validation results to fit to")
        if 'features' not in topics_and_results_Train.columns:
            raise ValueError("No features column retrieved in training")
        if 'features' not in topics_and_results_Valid.columns:
            raise ValueError("No features column retrieved in validation")

        # left-join qrels onto results; unjudged documents get label 0
        tr_res = topics_and_results_Train.merge(qrelsTrain, on=['qid', 'docno'], how='left').fillna(0)
        va_res = topics_and_results_Valid.merge(qrelsValid, on=['qid', 'docno'], how='left').fillna(0)
        kwargs = self.fit_kwargs

        # this enforces a sort on tr_res and va_res that matches the counts
        counts_tr, tr_res = _count_by_topic(tr_res)
        counts_va, va_res = _count_by_topic(va_res)
        self.learner.fit(
            np.stack(tr_res["features"].values), tr_res["label"].values,
            group=counts_tr, # we name group here for lightgbm compat.
            eval_set=[(np.stack(va_res["features"].values), va_res["label"].values)],
            eval_group=[counts_va],
            **kwargs
        )
        self.num_f = tr_res.iloc[0].features.shape[0]
        # return self for consistency with RegressionTransformer.fit()
        return self
class FastRankEstimator(Estimator):
    """
    This class simplifies the use of FastRank's techniques for learning-to-rank.
    """
    def __init__(self, learner, *args, **kwargs):
        """
        Init method

        Args:
            learner: The FastRank train request to use for learning-to-rank.
        """
        super().__init__(*args, **kwargs)
        self.learner = learner
        # trained FastRank model; None until fit() has been called
        self.model = None
        # number of features observed during fit(); None until fit() has been called
        self.num_f = None

    def _make_dataset(self, test_DF, add_labels=False):
        """
        Builds a FastRank CDataset from the dataframe's "features", "qid" and
        (optionally) "label" columns. When add_labels is False, zero labels
        are supplied, as FastRank requires a label array regardless.
        """
        from collections import defaultdict
        from itertools import count
        from fastrank import CDataset
        # map arbitrary qid strings onto dense consecutive integers
        qid_map = defaultdict(count().__next__)
        features = np.stack(test_DF["features"].values).astype('float32')
        qids = test_DF["qid"].apply(lambda qid: qid_map[qid]).values
        if add_labels:
            y = test_DF["label"].values
        else:
            y = np.zeros(len(test_DF))
        dataset = CDataset.from_numpy(features, y, qids)
        return dataset

    def fit(self, topics_and_results_Train, qrelsTrain, topics_and_results_Valid=None, qrelsValid=None):
        """
        Trains the model with the given training topics. Validation arguments
        are accepted for interface compatibility but are not used by FastRank.

        Args:
            topics_and_results_Train(DataFrame): A dataframe with the topics and results to train the model
            qrelsTrain(DataFrame): A dataframe containing the qrels for the training topics
        """
        if topics_and_results_Train is None or len(topics_and_results_Train) == 0:
            raise ValueError("No training results to fit to")
        if 'features' not in topics_and_results_Train.columns:
            raise ValueError("No features column retrieved in training")
        # left-join qrels onto results; unjudged documents get label 0
        tr_res = topics_and_results_Train.merge(qrelsTrain, on=['qid', 'docno'], how='left').fillna(0)
        dataset = self._make_dataset(tr_res, add_labels=True)
        self.num_f = dataset.num_features()
        self.model = dataset.train_model(self.learner)
        # return self for consistency with the other estimators in this module
        return self

    def transform(self, topics_and_docs_Test):
        """
        Predicts the scores for the given topics.

        Args:
            topics_and_docs_Test(DataFrame): A dataframe with the documents to score.
        """
        if self.model is None:
            raise ValueError("fit() must be called first")
        test_DF = topics_and_docs_Test.copy()
        dataset = self._make_dataset(test_DF, add_labels=False)
        # check for change in number of features w.r.t. what the model was fitted on
        found_numf = dataset.num_features()
        if self.num_f is not None and found_numf != self.num_f:
            raise ValueError("Expected %d features, but found %d features" % (self.num_f, found_numf))
        if hasattr(self.learner, 'feature_importances_'):
            if len(self.learner.feature_importances_) != found_numf:
                raise ValueError("Expected %d features, but found %d features" % (len(self.learner.feature_importances_), found_numf))
        rtr = dataset.predict_scores(self.model)
        # predict_scores returns an indexable container; copy into a plain list
        scores = [rtr[i] for i in range(len(rtr))]
        test_DF["score"] = scores
        return add_ranks(test_DF)
def ablate_features(fids: FeatureList) -> Transformer:
    """
    Ablates features (sets feature value to 0) from a pipeline. This is useful for
    performing feature ablation studies, whereby a feature is removed from the pipeline
    before learning.

    Args:
        fids: one or a list of integers corresponding to features indices to be removed
    """
    return AblateFeatures(fids)
def keep_features(fids: FeatureList) -> Transformer:
    """
    Reduces the features in a pipeline to only those mentioned. This is useful for
    performing feature ablation studies, whereby only some features are kept
    (and others removed) from a pipeline before learning occurs.

    Args:
        fids: one or a list of integers corresponding to the features indices to be kept
    """
    return KeepFeatures(fids)
def feature_to_score(fid: int) -> Transformer:
    """
    Applies a specified feature for ranking. Useful for evaluating which of a number of
    pre-computed features are useful for ranking.

    Args:
        fid: a single feature id that should be kept
    """
    return doc_score(lambda row: row["features"][fid])
def apply_learned_model(learner, form: str = 'regression', **kwargs) -> Transformer:
    """
    Results in a transformer that can take in documents that have a "features" column,
    and pass that to the specified learner via its transform() function, to obtain the
    documents' "score" column. Learners should follow the sklearn's general pattern
    with a fit() method (
    c.f. an sklearn `Estimator <https://scikit-learn.org/stable/glossary.html#term-estimator>`_)
    and a `predict() <https://scikit-learn.org/stable/glossary.html#term-predict>`_ method.

    xgBoost and LightGBM are also supported through the use of the `form='ltr'` kwarg.

    Args:
        learner: an sklearn-compatible estimator
        form(str): either 'regression', 'ltr' or 'fastrank'; any other value
            falls back to 'regression' behaviour
    """
    if form == 'ltr':
        return LTRTransformer(learner, **kwargs)
    if form == 'fastrank':
        return FastRankEstimator(learner, **kwargs)
    return RegressionTransformer(learner, **kwargs)
def score_to_feature() -> Transformer:
    """
    Takes the document's "score" from the score attribute, and uses it as a single feature.
    In particular, a feature union operator does not use any score of the documents in the
    candidate set as a ranking feature. Using the resulting transformer within a feature-union
    means that an additional ranking feature is added to the "features" column.

    Example::

        cands = pt.BatchRetrieve(index, wmodel="BM25")
        bm25f = pt.BatchRetrieve(index, wmodel="BM25F")
        pl2f = pt.BatchRetrieve(index, wmodel="PL2F")

        two_features = cands >> (bm25f ** pl2f)
        three_features = cands >> (bm25f ** pl2f ** pt.ltr.score_to_feature())
    """
    return doc_features(lambda row: np.array(row["score"]))