# Source code for pyterrier.new


from typing import Sequence, Union
import pandas as pd
from .model import add_ranks

def empty_Q() -> pd.DataFrame:
    """
    Returns an empty dataframe with columns `["qid", "query"]`.

    Returns:
        A dataframe with zero rows whose columns are `qid` and `query`, in that order.
    """
    return pd.DataFrame(columns=["qid", "query"])
def queries(queries : Union[str, Sequence[str]], qid : Union[str, Sequence[str]] = None, **others) -> pd.DataFrame:
    """
    Creates a new queries dataframe. Will return a dataframe with the columns `["qid", "query"]`.
    Any further lists in others will also be added.

    Arguments:
        queries: The search queries. Either a string, for a single query, or a sequence (e.g. list of strings)
        qid: Corresponding query ids. Either a string, for a single query, or a sequence (e.g. list of strings).
            Must have same length as queries. For a single query an integer is also accepted and is
            converted to its string form. If omitted, qids are generated as strings counting from "1".
        others: A dictionary of other attributes to add to the query dataframe

    Returns:
        A dataframe with one row per query and the columns `qid`, `query`, followed by one column
        per entry in others.

    Raises:
        TypeError: if a single query string is given together with a qid that is neither a string
            nor an integer.

    Examples::

        # create a dataframe with one query, qid "1"
        one_query = pt.new.queries("what the noise was was the question")

        # create a dataframe with one query, qid "5"
        one_query = pt.new.queries("what the noise was was the question", 5)

        # create a dataframe with two queries
        two_queries = pt.new.queries(["query text A", "query text B"], ["1", "2"])

        # create a dataframe with two queries and an extra column
        two_queries = pt.new.queries(["query text A", "query text B"], ["1", "2"], categories=["catA", "catB"])
    """
    if isinstance(queries, str):
        # Single-query form: wrap the scalars into one-element columns.
        if qid is None:
            qid = "1"
        elif isinstance(qid, int):
            # The documented usage passes an integer qid; qids are stored as strings.
            qid = str(qid)
        elif not isinstance(qid, str):
            raise TypeError(
                "qid must be a string (or integer) when a single query is given, but was %s"
                % type(qid).__name__)
        return pd.DataFrame({"qid" : [qid], "query" : [queries], **others})

    if qid is None:
        # Build a concrete list rather than a lazy map so the column can be measured and re-read.
        qid = [str(i) for i in range(1, len(queries) + 1)]
    return pd.DataFrame({"qid" : qid, "query" : queries, **others})


# Short alias for queries().
Q = queries
def empty_R() -> pd.DataFrame:
    """
    Returns an empty dataframe with columns `["qid", "query", "docno", "rank", "score"]`.

    Returns:
        A dataframe with zero rows whose columns are `qid`, `query`, `docno`, `rank` and `score`,
        in that order.
    """
    # Pass only the column names: supplying `[[]]` as data describes one row with zero
    # fields, which does not match the five requested columns and is not an empty frame.
    return pd.DataFrame(columns=["qid", "query", "docno", "rank", "score"])
def ranked_documents(
        scores : Sequence[Sequence[float]],
        qid : Sequence[str] = None,
        docno=None,
        **others) -> pd.DataFrame:
    """
    Creates a new ranked documents dataframe. Will return a dataframe with the columns
    `["qid", "docno", "score", "rank"]`. Any further lists in others will also be added.

    Arguments:
        scores: The scores of the retrieved documents. Must be a list of lists (one inner list
            or tuple per query).
        qid: Corresponding query ids. Must have same length as the first dimension of scores.
            If omitted, qids are computed as strings starting from "1".
        docno: Corresponding docnos. Must have same length as the first dimension of scores and
            each 2nd dimension must be the same as the number of documents retrieved.
            If omitted, docnos are computed as strings starting from "d1" for each query.
        others: A dictionary of other attributes to add to the dataframe. Each value must be
            nested per query in the same way as scores; it is flattened into one column.

    Returns:
        The result of `add_ranks()` applied to a dataframe with one row per (query, document)
        pair and the columns `score`, `docno`, `qid` plus one column per entry in others.
        If scores is empty, the result of `empty_R()` is returned instead.

    Raises:
        ValueError: if the first element of scores is not a list or tuple, or if qid or docno
            does not have the same length as scores.

    Examples::

        # one query, one document
        R1 = pt.new.ranked_documents([[1]])

        # one query, two documents
        R2 = pt.new.ranked_documents([[1, 2]])

        # two queries, one documents each
        R3 = pt.new.ranked_documents([[1], [2]])

        # one query, one document, qid specified
        R4 = pt.new.ranked_documents([[1]], qid=["q100"])

        # one query, one document, qid and docno specified
        R5 = pt.new.ranked_documents([[1]], qid=["q100"], docno=[["d20"]])
    """
    from itertools import chain

    if len(scores) == 0:
        return empty_R()

    # Only the nested (one row of scores per query) form is supported.
    if not isinstance(scores[0], (list, tuple)):
        raise ValueError("We assume multiple documents, for now")

    if qid is None:
        qid = [str(i) for i in range(1, len(scores) + 1)]
    elif len(qid) != len(scores):
        # Raise explicitly: zip() below would otherwise silently truncate to the shorter input.
        raise ValueError(
            "qid had length %d but was expected to have length %d" % (len(qid), len(scores)))

    if docno is None:
        docno = [["d%d" % i for i in range(1, len(row) + 1)] for row in scores]
    elif len(docno) != len(scores):
        raise ValueError(
            "docno had length %d but was expected to have length %d" % (len(docno), len(scores)))

    # Flatten the per-query nesting into one row per retrieved document.
    rtr = pd.DataFrame(list(chain.from_iterable(scores)), columns=["score"])
    rtr["docno"] = list(chain.from_iterable(docno))
    # Repeat each qid once for every document retrieved for that query.
    rtr["qid"] = [q for q, row in zip(qid, scores) for _ in row]
    for k, v in others.items():
        rtr[k] = list(chain.from_iterable(v))

    return add_ranks(rtr)


# Short alias for ranked_documents().
R = ranked_documents