import urllib.request
import urllib.error
import os
import json
import pandas as pd
from .transformer import is_lambda
from abc import abstractmethod
import types
from collections import defaultdict
from typing import Union, Tuple, Dict, List, Literal, Optional
from warnings import warn
import requests
from .io import autoopen, touch
import pyterrier as pt
import tarfile
import zipfile
import ir_datasets
TERRIER_DATA_BASE="http://data.terrier.org/indices/"
STANDARD_TERRIER_INDEX_FILES = [
"data.direct.bf",
"data.document.fsarrayfile",
"data.inverted.bf",
"data.lexicon.fsomapfile",
"data.lexicon.fsomaphash",
"data.lexicon.fsomapid",
"data.meta.idx",
"data.meta.zdata",
"data.properties"
]
class GeneratorLen(object):
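    """Wraps a generator with a known length, so that len() works (e.g. for tqdm progress bars or indexers that report progress)."""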
def __init__(self, gen, length):
self.gen = gen
self.length = length
def __len__(self):
return self.length
def __iter__(self):
return self.gen
class Dataset():
"""
Represents a dataset (test collection) for indexing or retrieval. A common use-case is to use the Dataset within an Experiment::
dataset = pt.get_dataset("trec-robust-2004")
pt.Experiment([br1, br2], dataset.get_topics(), dataset.get_qrels(), eval_metrics=["map", "recip_rank"])
"""
def _configure(self, **kwargs):
pass
def get_corpus(self):
"""
Returns the location of the files to allow indexing the corpus, i.e. it returns a list of filenames.
"""
pass
@abstractmethod
def get_corpus_iter(self, verbose=True) -> pt.model.IterDict:
"""
        Returns an iterator of dicts for this collection. If verbose=True, a tqdm progress bar shows the progress over this iterator.
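
        Example (a minimal sketch -- assumes you wish to index into ./index with a Terrier indexer)::

            dataset = pt.get_dataset("irds:vaswani")
            indexer = pt.IterDictIndexer("./index")
            indexref = indexer.index(dataset.get_corpus_iter())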
"""
pass
def get_corpus_lang(self) -> Union[str,None]:
"""
Returns the ISO 639-1 language code for the corpus, or None for multiple/other/unknown
"""
return None
def get_index(self, variant=None, **kwargs):
"""
Returns the IndexRef of the index to allow retrieval. Only a few datasets provide indices ready made.
"""
pass
@abstractmethod
def get_topics(self, variant=None) -> pd.DataFrame:
"""
Returns the topics, as a dataframe, ready for retrieval.
"""
pass
def get_topics_lang(self) -> Union[str,None]:
"""
Returns the ISO 639-1 language code for the topics, or None for multiple/other/unknown
"""
return None
@abstractmethod
def get_qrels(self, variant=None) -> pd.DataFrame:
"""
Returns the qrels, as a dataframe, ready for evaluation.
"""
pass
def get_topicsqrels(self, variant=None) -> Tuple[pd.DataFrame,pd.DataFrame]:
"""
Returns both the topics and qrels in a tuple. This is useful for pt.Experiment().
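
        Example::

            dataset = pt.get_dataset("trec-robust-2004")
            topics, qrels = dataset.get_topicsqrels()
            pt.Experiment([br1, br2], topics, qrels, eval_metrics=["map"])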
"""
return (
self.get_topics(variant=variant),
self.get_qrels(variant=variant)
)
def info_url(self):
"""
Returns a url that provides more information about this dataset.
"""
return None
def get_results(self, variant=None) -> pd.DataFrame:
"""
Returns a standard result set provided by the dataset. This is useful for re-ranking experiments.
"""
return None
class RemoteDataset(Dataset):
def __init__(self, name, locations):
self.locations = locations
self.name = name
self.user = None
self.password = None
def _configure(self, **kwargs):
self.corpus_home = os.path.join(pt.io.pyterrier_home(), "corpora", self.name)
if 'user' in kwargs:
self.user = kwargs['user']
self.password = kwargs['password']
@staticmethod
def download(URLs : Union[str,List[str]], filename : str, **kwargs):
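        """Downloads a file to ``filename`` from the first reachable of the given mirror URL(s), showing a tqdm progress bar.
        If a mirror fails, the next is tried; if all fail, the last exception is raised with earlier ones chained."""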
basename = os.path.basename(filename)
if isinstance(URLs, str):
URLs = [URLs]
finalattempt=len(URLs)-1
error = None
for i, url in enumerate(URLs):
try:
r = requests.get(url, allow_redirects=True, stream=True, **kwargs)
r.raise_for_status()
total = int(r.headers.get('content-length', 0))
with pt.io.finalized_open(filename, 'b') as file, pt.tqdm(
desc=basename,
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in r.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
break
except Exception as e:
if error is not None:
                    e.__cause__ = error # chain errors so that all failures are reported if every mirror fails
error = e
if i == finalattempt:
raise error
else:
warn(
"Problem fetching %s, resorting to next mirror" % url)
def _check_variant(self, component, variant=None):
name=self.name
if component not in self.locations:
raise ValueError("No %s in dataset %s" % (component, name))
if variant is None:
if not isinstance(self.locations[component], list):
raise ValueError("For %s in dataset %s, you must specify a variant. Available are: %s" % (component, name, str(list(self.locations[component].keys()))))
else:
if isinstance(self.locations[component], list):
raise ValueError("For %s in dataset %s, there are no variants, but you specified %s" % (component, name, variant))
if variant not in self.locations[component]:
raise ValueError("For %s in dataset %s, there is no variant %s. Available are: %s" % (component, name, variant, str(list(self.locations[component].keys()))))
def _get_one_file(self, component, variant=None):
filetype=None
name=self.name
self._check_variant(component, variant)
location = self.locations[component][0] if variant is None else self.locations[component][variant]
if is_lambda(location) or isinstance(location, types.FunctionType):
argcount = location.__code__.co_argcount
if argcount == 0:
return location()
elif argcount == 3:
return location(self, component, variant)
else:
raise TypeError("Expected function with 0 or 3 arguments for %s %s %s" % (component, name, variant))
local = location[0]
URL = location[1]
if len(location) > 2:
filetype = location[2]
if not os.path.exists(self.corpus_home):
os.makedirs(self.corpus_home)
local = os.path.join(self.corpus_home, local)
actualURL = URL if isinstance(URL, str) else URL[0]
if "#" in actualURL and not os.path.exists(local):
tarname, intarfile = actualURL.split("#")
assert "/" not in intarfile
assert ".tar" in tarname or ".tgz" in tarname or ".zip" in tarname
localtarfile, _ = self._get_one_file("tars", tarname)
extractor = zipfile.ZipFile if ".zip" in tarname else tarfile.open
with extractor(localtarfile, "r") as tarobj:
tarobj.extract(intarfile, path=self.corpus_home)
os.rename(os.path.join(self.corpus_home, intarfile), local)
return (local, filetype)
if not os.path.exists(local):
try:
print("Downloading %s %s to %s" % (self.name, component, local))
kwargs = {}
if self.user is not None:
kwargs["auth"]=(self.user, self.password)
RemoteDataset.download(URL, local, **kwargs)
except urllib.error.HTTPError as he:
                raise ValueError("Could not fetch %s" % URL) from he
return (local, filetype)
def _get_all_files(self, component, variant=None, **kwargs):
if variant is None:
localDir = os.path.join(self.corpus_home, component)
else:
localDir = os.path.join(self.corpus_home, component, variant)
kwargs = {}
if self.user is not None:
kwargs["auth"]=(self.user, self.password)
direxists = os.path.exists(localDir)
location = self.locations[component]
if is_lambda(location) or isinstance(location, types.FunctionType):
            # functions are expensive to call, as normally another HTTP request is needed;
            # just assume we have everything we need if the local directory already exists
            # and it contains a .complete file.
if direxists and os.path.exists(os.path.join(localDir, ".complete")):
return localDir
# call the function, and get the file list
file_list = location(self, component, variant, **kwargs)
else:
file_list = self.locations[component] if variant is None else self.locations[component][variant]
if not direxists:
os.makedirs(localDir)
print("Downloading %s %s to %s" % (self.name, component, localDir))
# check for how much space is required and available space
def _totalsize(file_list):
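            # returns -1 if no file sizes are known, otherwise the sum of the known sizes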
total = -1
for f in file_list:
if len(f) > 2:
total += f[2]
if total != -1:
total += 1
return total
totalsize = _totalsize(file_list)
if totalsize > 0:
import shutil
total, used, free = shutil.disk_usage(localDir)
if free < totalsize:
                raise ValueError("Insufficient free disk space at %s to download index" % localDir)
if totalsize > 2 * 2**30:
warn(
"Downloading index of > 2GB.")
# all tarfiles that we will need to process
tarfiles = defaultdict(list)
for fileentry in file_list:
local = fileentry[0]
URL = fileentry[1]
            assert "/" not in local, "cannot handle / in local name %s" % local
expectedlength = -1
if len(fileentry) == 3:
expectedlength = fileentry[2]
local = os.path.join(localDir, local)
            # if the file exists and we know its length, check if the download is complete
fileexists = os.path.exists(local)
if fileexists and expectedlength >= 0:
length = os.stat(local).st_size
if expectedlength != length:
warn(
"Removing partial download of %s (expected %d bytes, found %d)" % (local, expectedlength, length ))
os.remove(local)
fileexists = False
if not fileexists:
if "#" in URL:
tarname, intarfile = URL.split("#")
                    assert ".tar" in tarname or ".tgz" in tarname or ".zip" in tarname, "I don't know how to decompress file %s" % tarname
localtarfile, _ = self._get_one_file("tars", tarname)
# append intarfile to the list of files to be extracted from localtarfile
tarfiles[localtarfile].append((intarfile, local))
else:
try:
RemoteDataset.download(URL, local, **kwargs)
except urllib.error.HTTPError as he:
                        raise ValueError("Could not fetch %s" % URL) from he
# verify file if exists
if expectedlength >= 0:
length = os.stat(local).st_size
if expectedlength != length:
raise ValueError("Failed download of %s to %s (expected %d bytes, found %d)" % (URL, local, expectedlength, length ))
# now extract all required files from each tar file
for localtarfile in tarfiles:
            extractor = zipfile.ZipFile if ".zip" in localtarfile else tarfile.open
with extractor(localtarfile, "r") as tarobj:
                # 5 is an arbitrary threshold - if we have lots of files to extract, give a progress bar
                file_iter = pt.tqdm(tarfiles[localtarfile], unit="file", desc="Extracting from " + localtarfile) if len(tarfiles[localtarfile]) > 5 else tarfiles[localtarfile]
                for (intarfile, local) in file_iter:
tarobj.extract(intarfile, path=self.corpus_home)
                    os.rename(os.path.join(self.corpus_home, intarfile), local)
#TODO, files /could/ be recompressed here to save space, if not already compressed
# finally, touch a file signifying that download has been completed
touch(os.path.join(localDir, ".complete"))
return localDir
def _describe_component(self, component):
if component not in self.locations:
return None
if isinstance(self.locations[component], list):
return True
if isinstance(self.locations[component], dict):
return list(self.locations[component].keys())
return True
def get_corpus(self, **kwargs):
return list(filter(lambda f : not f.endswith(".complete"), pt.io.find_files(self._get_all_files("corpus", **kwargs))))
def get_corpus_iter(self, **kwargs):
if "corpus_iter" not in self.locations:
raise ValueError("Cannot supply a corpus iterator on dataset %s" % self.name)
return self.locations["corpus_iter"](self, **kwargs)
def get_corpus_lang(self):
if 'corpus' in self.locations:
return 'en' # all are english
return None
def get_qrels(self, variant=None):
filename, type = self._get_one_file("qrels", variant)
if type == "direct":
return filename
return pt.io.read_qrels(filename)
def get_topics(self, variant=None, **kwargs):
file, filetype = self._get_one_file("topics", variant)
if filetype is None or filetype in pt.io.SUPPORTED_TOPICS_FORMATS:
return pt.io.read_topics(file, format=filetype, **kwargs)
elif filetype == "direct":
return file
raise ValueError("Unknown filetype %s for %s topics %s" % (filetype, self.name, variant))
def get_topics_lang(self):
if 'topics' in self.locations:
return 'en' # all are english
return None
def get_index(self, variant=None, **kwargs):
if self.name == "50pct" and variant is None:
variant="ex1"
thedir = self._get_all_files("index", variant=variant, **kwargs)
return thedir
def __repr__(self):
return "RemoteDataset for %s, with %s" % (self.name, str(list(self.locations.keys())))
def info_url(self):
return self.locations['info_url'] if "info_url" in self.locations else None
@pt.java.required
def _pt_tokeniser():
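    """Returns a function that applies Terrier's English tokeniser to a string, rejoining the tokens with spaces (requires Java)."""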
tokeniser = pt.terrier.J.Tokenizer.getTokeniser()
def pt_tokenise(text):
return ' '.join(tokeniser.getTokens(text))
return pt_tokenise
class IRDSDataset(Dataset):
def __init__(self, irds_id, defer_load=False):
self._irds_id = irds_id
self._irds_ref = None if defer_load else ir_datasets.load(self._irds_id)
def irds_ref(self):
if self._irds_ref is None:
self._irds_ref = ir_datasets.load(self._irds_id)
return self._irds_ref
def get_corpus(self):
raise NotImplementedError("IRDSDataset doesn't support get_corpus; use get_corpus_iter instead. If you "
"are indexing, get_corpus_iter should be used in conjunction with IterDictIndexer.")
def get_corpus_iter(self, verbose=True, start=0, count=None):
ds = self.irds_ref()
assert ds.has_docs(), f"{self._irds_id} doesn't support get_corpus_iter"
it = ds.docs_iter()
total = ds.docs_count()
# use slicing if requested
if start > 0 or count is not None:
if count is not None:
it = it[start:start+count]
total = count
else:
it = it[start:]
total -= start
# tqdm support
if verbose:
it = pt.tqdm(it, desc=f'{self._irds_id} documents', total=total)
        # rewrite the documents to follow the pyterrier data model
def gen():
for doc in it:
doc = doc._asdict()
# pyterrier uses "docno"
doc['docno'] = doc.pop('doc_id')
yield doc
# ensure we can provide accurate len
return GeneratorLen(gen(), total)
def get_corpus_lang(self):
ds = self.irds_ref()
if ds.has_docs():
return ds.docs_lang()
return None
def get_index(self, variant=None):
# this is only for indices where Terrier provides an index already
raise NotImplementedError("IRDSDataset doesn't support get_index")
def get_topics(self, variant=None, tokenise_query=True):
"""
Returns the topics, as a dataframe, ready for retrieval.
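
        Example (a sketch -- available variants depend on the underlying ir_datasets dataset)::

            dataset = pt.get_dataset('irds:antique/test')
            topics = dataset.get_topics()  # the single query field is auto-renamed to "query"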
"""
ds = self.irds_ref()
assert ds.has_queries(), f"{self._irds_id} doesn't support get_topics"
qcls = ds.queries_cls()
assert variant is None or variant in qcls._fields[1:], f"{self._irds_id} only supports the following topic variants {qcls._fields[1:]}"
df = pd.DataFrame(ds.queries_iter())
df.rename(columns={"query_id": "qid"}, inplace=True) # pyterrier uses "qid"
if variant is not None:
# Some datasets have a query field called "query". We need to remove it or
# we'll end up with multiple "query" columns, which will cause problems
# because many components are written assuming no columns have the same name.
if variant != 'query' and 'query' in df.columns:
df.drop(columns=['query'], axis=1, inplace=True)
df.rename(columns={variant: "query"}, inplace=True) # user specified which version of the query they want
df.drop(columns=df.columns.difference(['qid','query']), axis=1, inplace=True)
elif len(qcls._fields) == 2:
# auto-rename single query field to "query" if there's only query_id and that field
df.rename(columns={qcls._fields[1]: "query"}, inplace=True)
else:
print(f'There are multiple query fields available: {qcls._fields[1:]}. To use with pyterrier, provide variant or modify dataframe to add query column.')
# apply pyterrier tokenisation (otherwise the queries may not play well with batchretrieve)
if tokenise_query and 'query' in df:
tokeniser = _pt_tokeniser()
df['query'] = df['query'].apply(tokeniser)
return df
def get_topics_lang(self):
ds = self.irds_ref()
if ds.has_queries():
return ds.queries_lang()
return None
def get_qrels(self, variant=None):
"""
Returns the qrels, as a dataframe, ready for evaluation.
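
        Example::

            qrels = pt.get_dataset('irds:antique/test').get_qrels()
            # columns: qid, docno, label (plus iteration, where provided)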
"""
ds = self.irds_ref()
assert ds.has_qrels(), f"{self._irds_id} doesn't support get_qrels"
qrelcls = ds.qrels_cls()
qrel_fields = [f for f in qrelcls._fields if f not in ('query_id', 'doc_id', 'iteration')]
assert variant is None or variant in qrel_fields, f"{self._irds_id} only supports the following qrel variants {qrel_fields}"
df = pd.DataFrame(ds.qrels_iter())
# pyterrier uses "qid" and "docno"
df.rename(columns={
"query_id": "qid",
"doc_id": "docno"}, inplace=True)
# pyterrier uses "label"
if variant is not None:
df.rename(columns={variant: "label"}, inplace=True)
        elif len(qrel_fields) == 1:
# usually "relevance"
df.rename(columns={qrel_fields[0]: "label"}, inplace=True)
elif 'relevance' in qrel_fields:
print(f'There are multiple qrel fields available: {qrel_fields}. Defaulting to "relevance", but to use a different one, supply variant')
df.rename(columns={'relevance': "label"}, inplace=True)
else:
            print(f'There are multiple qrel fields available: {qrel_fields}. To use with pyterrier, provide variant or modify dataframe to add a label column.')
return df
def get_results(self, variant=None) -> pd.DataFrame:
"""
Returns a standard result set provided by the dataset. This is useful for re-ranking experiments.
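
        Example (a sketch -- assumes a dataset providing scoreddocs, and ``my_reranker``, a hypothetical text-based transformer)::

            dataset = pt.get_dataset('irds:msmarco-passage/trec-dl-2019/judged')
            run = dataset.get_results()
            reranked = my_reranker(run)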
"""
ds = self.irds_ref()
        assert ds.has_scoreddocs(), f"{self._irds_id} doesn't support get_results"
result = pd.DataFrame(ds.scoreddocs)
result = result.rename(columns={'query_id': 'qid', 'doc_id': 'docno'}) # convert irds field names to pyterrier names
        result.sort_values(by=['qid', 'score', 'docno'], ascending=[True, False, True], inplace=True) # ensure data is sorted by qid, descending score, then docno
# result doesn't yet contain queries (only qids) so load and merge them in
topics = self.get_topics(variant)
result = pd.merge(result, topics, how='left', on='qid')
return result
def _describe_component(self, component):
ds = self.irds_ref()
if component == "topics":
if ds.has_queries():
fields = ds.queries_cls()._fields[1:]
if len(fields) > 1:
return list(fields)
return True
return None
if component == "qrels":
if ds.has_qrels():
fields = [f for f in ds.qrels_cls()._fields if f not in ('query_id', 'doc_id', 'iteration')]
if len(fields) > 1:
return list(fields)
return True
return None
if component == "corpus":
return ds.has_docs() or None
if component == "results":
return ds.has_scoreddocs() or None
return None
def info_url(self):
top_id = self._irds_id.split('/', 1)[0]
suffix = f'#{self._irds_id}' if top_id != self._irds_id else ''
return f'https://ir-datasets.com/{top_id}.html{suffix}'
def __repr__(self):
return f"IRDSDataset({repr(self._irds_id)})"
def text_loader(
self,
fields: Union[List[str], str, Literal['*']] = '*',
*,
verbose: bool = False,
) -> pt.Transformer:
"""Create a transformer that loads text fields from an ir_datasets dataset into a DataFrame by docno.
Args:
fields: The fields to load from the dataset. If '*', all fields will be loaded.
verbose: Whether to print debug information.
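
        Example (a sketch -- assumes ``retriever`` is a transformer that returns docnos for this dataset)::

            dataset = pt.get_dataset('irds:antique')
            pipeline = retriever >> dataset.text_loader(['text'])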
"""
return IRDSTextLoader(self, fields, verbose=verbose)
class IRDSTextLoader(pt.Transformer):
"""A transformer that loads text fields from an ir_datasets dataset into a DataFrame by docno."""
def __init__(
self,
dataset: IRDSDataset,
fields: Union[List[str], str, Literal['*']] = '*',
*,
verbose=False
):
"""Initialise the transformer with the index to load metadata from.
Args:
dataset: The dataset to load text from.
fields: The fields to load from the dataset. If '*', all fields will be loaded.
verbose: Whether to print debug information.
"""
if not dataset.irds_ref().has_docs():
raise ValueError(f"Dataset {dataset} does not provide docs")
docs_cls = dataset.irds_ref().docs_cls()
available_fields = [f for f in docs_cls._fields if f != 'doc_id' and docs_cls.__annotations__[f] is str]
if fields == '*':
fields = available_fields
else:
if isinstance(fields, str):
fields = [fields]
missing_fields = set(fields) - set(available_fields)
if missing_fields:
                raise ValueError(f"Dataset {dataset} does not provide the requested fields {list(missing_fields)}. "
                                 f"Available fields are: {available_fields}")
self.dataset = dataset
self.fields = fields
self.verbose = verbose
def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
"""Load text fields from the dataset into the input DataFrame.
Args:
inp: The input DataFrame. Must contain 'docno'.
Returns:
A new DataFrame with the text columns appended.
"""
if 'docno' not in inp.columns:
raise ValueError(f"input missing 'docno' column, available columns: {list(inp.columns)}")
irds = self.dataset.irds_ref()
docstore = irds.docs_store()
docnos = inp.docno.values.tolist()
# Load the new data
fields = ['doc_id'] + self.fields
set_docnos = set(docnos)
it = (tuple(getattr(doc, f) for f in fields) for doc in docstore.get_many_iter(set_docnos))
if self.verbose:
it = pt.tqdm(it, unit='d', total=len(set_docnos), desc='IRDSTextLoader')
metadata = pd.DataFrame(list(it), columns=fields).set_index('doc_id')
metadata_frame = metadata.loc[docnos].reset_index(drop=True)
# append the input and metadata frames
inp = inp.drop(columns=self.fields, errors='ignore') # make sure we don't end up with duplicates
inp = inp.reset_index(drop=True) # reset the index to default (matching metadata_frame)
return pd.concat([inp, metadata_frame], axis='columns')
def passage_generate(dataset):
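    """Yields one {'docno', 'text'} dict per line of each tab-separated corpus file."""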
for filename in dataset.get_corpus():
with autoopen(filename, 'rt') as corpusfile:
for line in corpusfile: #for each line
                docno, passage = line.rstrip('\n').split("\t")
yield {'docno' : docno, 'text' : passage}
def _datarepo_index(self, component, variant=None, version='latest', **kwargs):
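    """Fetches the file manifest for an index hosted on the Terrier data repository (data.terrier.org), returning (filename, URL, size) tuples for download."""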
if variant is None:
raise ValueError(f"Must specify index variant for {self.name}. See http://data.terrier.org/{self.name}.dataset.html")
    urlprefix = f"{TERRIER_DATA_BASE}{self.name}/{variant}/{version}/"
url = urlprefix + "files"
try:
r = requests.get(url, **kwargs)
r.raise_for_status()
file = r.text.splitlines()
except Exception as e:
raise ValueError(f"Could not find index variant {variant} for dataset {self.name} at {url}. See available variants at http://data.terrier.org/{self.name}.dataset.html") from e
rtr = []
import re
for linenum, line in enumerate(file):
# skip comments
if line.startswith("#"):
continue
try:
            (length, filename) = re.split(r"\s+", line.strip(), maxsplit=1)
rtr.append((filename, urlprefix+filename, int(length)))
except Exception as e:
raise ValueError(f"Could not parse {url} line {linenum} '{line}'") from e
return rtr
def _datarepo_index_default_none(self, component, variant=None, version='latest', **kwargs):
"""
    For backward compatibility with vaswani - use a default for variant
"""
if variant is None:
variant = 'terrier_stemmed'
return _datarepo_index(self, component, variant=variant, version=version, **kwargs)
ANTIQUE_FILES = {
"topics" : {
"train" : ("antique-train-queries.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt", "singleline"),
"test" : ("antique-test-queries.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt", "singleline"),
},
"qrels" : {
"train" : ("antique-train.qrel", "http://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel", "singleline"),
"test" : ("antique-test.qrel", "http://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel", "singleline"),
},
"corpus" :
[("antique-collection.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt")],
"info_url" : "https://ciir.cs.umass.edu/downloads/Antique/readme.txt",
"corpus_iter" : passage_generate
}
TREC_COVID_FILES = {
"topics" : {
"round1" : ("topics-rnd1.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml", "trecxml"),
"round2" : ("topics-rnd2.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml", "trecxml"),
"round3" : ("topics-rnd3.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd3.xml", "trecxml"),
"round4" : ("topics-rnd4.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd4.xml", "trecxml"),
"round5" : ("topics-rnd5.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml", "trecxml"),
},
"qrels" : {
"round1" : ("qrels-rnd1.txt", "https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt"),
"round2" : ("qrels-rnd2.txt", "https://ir.nist.gov/covidSubmit/data/qrels-rnd2.txt"),
"round3" : ("qrels-rnd3.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j2.5-3.txt"),
"round3-cumulative" : ("qrels-rnd3-cumulative.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j0.5-3.txt"),
"round4" : ("qrels-rnd4.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j3.5-4.txt"),
"round4-cumulative" : ("qrels-rnd4-cumulative.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j0.5-4.txt"),
"round5" : ("qrels-covid_d5_j4.5-5.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j4.5-5.txt"),
},
"corpus" : {
"round4": ("round4.tar.gz", "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-06-19.tar.gz"),
"round5": ("round5.tar.gz", "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz"),
},
"docids" : {
"docids-rnd3" : ("docids-rnd3.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd3.txt"),
"docids-rnd4" : ("docids-rnd4.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd4.txt"),
"docids-rnd5" : ("docids-rnd5.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd5.txt")
},
"info_url" : "https://ir.nist.gov/covidSubmit/",
"index": _datarepo_index
}
def msmarco_document_generate(dataset):
for filename in dataset.get_corpus(variant="corpus-tsv"):
with autoopen(filename, 'rt') as corpusfile:
for line in corpusfile: #for each line
                docno, url, title, passage = line.rstrip('\n').split("\t")
yield {'docno' : docno, 'url' : url, 'title' : title, 'text' : passage}
MSMARCO_DOC_FILES = {
"corpus" :
[("msmarco-docs.trec.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")],
"corpus-tsv":
[("msmarco-docs.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")],
"topics" :
{
"train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"),
"dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"),
"test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
"test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline")
},
"qrels" :
{
"train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"),
"dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"),
"test" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-docs.txt"),
"test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-docs.txt")
},
"info_url" : "https://microsoft.github.io/msmarco/",
"corpus_iter" : msmarco_document_generate,
"index" : _datarepo_index
}
MSMARCO_PASSAGE_FILES = {
"corpus" :
[("collection.tsv", "collection.tar.gz#collection.tsv")],
"topics" :
{
"train" : ("queries.train.tsv", "queries.tar.gz#queries.train.tsv", "singleline"),
"dev" : ("queries.dev.tsv", "queries.tar.gz#queries.dev.tsv", "singleline"),
"dev.small" : ("queries.dev.small.tsv", "collectionandqueries.tar.gz#queries.dev.small.tsv", "singleline"),
"eval" : ("queries.eval.tsv", "queries.tar.gz#queries.eval.tsv", "singleline"),
"eval.small" : ("queries.eval.small.tsv", "collectionandqueries.tar.gz#queries.eval.small.tsv", "singleline"),
"test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
"test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline")
},
"tars" : {
"queries.tar.gz" : ("queries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz"),
"collection.tar.gz" : ("collection.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz"),
"collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz")
},
"qrels" :
        {
            "train" : ("qrels.train.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv"),
            "dev" : ("qrels.dev.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv"),
            "test-2019" : ("2019qrels-pass.txt", "https://trec.nist.gov/data/deep/2019qrels-pass.txt"),
            "test-2020" : ("2020qrels-pass.txt", "https://trec.nist.gov/data/deep/2020qrels-pass.txt"),
            "dev.small" : ("qrels.dev.small.tsv", "collectionandqueries.tar.gz#qrels.dev.small.tsv"),
        },
"info_url" : "https://microsoft.github.io/MSMARCO-Passage-Ranking/",
"corpus_iter" : passage_generate,
"index" : _datarepo_index
}
MSMARCOv2_DOC_FILES = {
"info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
"topics" : {
"train" : ("docv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"),
"dev1" :("docv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"),
"dev2" :("docv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"),
"valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
"valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
"trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
},
"qrels" : {
"train" : ("docv2_train_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"),
"dev1" :("docv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"),
"dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"),
"valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"),
"valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz")
},
"index" : _datarepo_index,
}
MSMARCOv2_PASSAGE_FILES = {
"info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
"topics" : {
"train" : ("passv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"),
"dev1" : ("passv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"),
"dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"),
"trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
    },
    "qrels" : {
        "train" : ("passv2_train_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"),
        "dev1" : ("passv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"),
        "dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"),
    },
"index" : _datarepo_index,
}
# remove WT- prefix from topics
def remove_prefix(self, component, variant):
topics_file, type = self._get_one_file("topics_prefixed", variant)
if type in pt.io.SUPPORTED_TOPICS_FORMATS:
topics = pt.io.read_topics(topics_file, type)
else:
raise ValueError("Unknown topic type %s" % type)
topics["qid"] = topics.apply(lambda row: row["qid"].split("-")[1], axis=1)
return (topics, "direct")
# a function to fix the namedpage TREC Web tracks 2001 and 2002
def parse_desc_only(self, component, variant):
file, type = self._get_one_file("topics_desc_only", variant=variant)
topics = pt.io.read_topics(file, format="trec", whitelist=["DESC"], blacklist=None)
topics["qid"] = topics.apply(lambda row: row["qid"].replace("NP", ""), axis=1)
topics["qid"] = topics.apply(lambda row: row["qid"].replace("EP", ""), axis=1)
return (topics, "direct")
TREC_WT_2002_FILES = {
"topics" :
{
"td" : ("webtopics_551-600.txt.gz", "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "trec"),
"np" : parse_desc_only
},
"topics_desc_only" : {
"np" : ("webnamed_page_topics.1-150.txt.gz", "https://trec.nist.gov/data/topics_eng/webnamed_page_topics.1-150.txt.gz", "trec")
},
"qrels" :
{
"np" : ("qrels.named-page.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz"),
"td" : ("qrels.distillation.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.distillation.txt.gz")
},
"info_url" : "https://trec.nist.gov/data/t11.web.html",
}
TREC_WT_2003_FILES = {
"topics" :
{
"np" : ("webtopics_551-600.txt.gz", "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "trec"),
"td" : ("2003.distillation_topics.1-50.txt", "https://trec.nist.gov/data/topics_eng/2003.distillation_topics.1-50.txt", "trec"),
},
"qrels" :
{
"np" : ("qrels.named-page.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz"),
"td" : ("qrels.distillation.2003.txt", "https://trec.nist.gov/data/qrels_eng/qrels.distillation.2003.txt")
},
"info_url" : "https://trec.nist.gov/data/t12.web.html",
}
def irds_mirror(md5):
return f'http://mirror.ir-datasets.com/{md5}'
def filter_on_qid_type(self, component, variant):
if component == "topics":
data = self.get_topics("all")
elif component == "qrels":
data = self.get_qrels("all")
qid2type_file = self._get_one_file("topics_map")[0]
qid2type = pd.read_csv(qid2type_file, names=["qid", "type"], sep=" ")
qid2type["qid"] = qid2type.apply(lambda row: row["qid"].split("-")[1], axis=1)
rtr = data.merge(qid2type[qid2type["type"] == variant], on=["qid"])
if len(rtr) == 0:
raise ValueError("No such topic type '%s'" % variant)
rtr.drop(columns=['type'], inplace=True)
return (rtr, "direct")
TREC_WT_2004_FILES = {
"topics" :
{
"all" : remove_prefix,
"np": filter_on_qid_type,
"hp": filter_on_qid_type,
"td": filter_on_qid_type,
},
"topics_map" : [("04.topic-map.official.txt", [
"https://trec.nist.gov/data/web/04.topic-map.official.txt",
irds_mirror("79737768b3be1aa07b14691aa54802c5"),
"https://www.dcs.gla.ac.uk/~craigm/04.topic-map.official.txt"
] )],
"topics_prefixed" : {
"all" : ("Web2004.query.stream.trecformat.txt", [
"https://trec.nist.gov/data/web/Web2004.query.stream.trecformat.txt",
irds_mirror("10821f7a000b8bec058097ede39570be"),
"https://www.dcs.gla.ac.uk/~craigm/Web2004.query.stream.trecformat.txt"],
"trec")
},
"qrels" :
{
"hp" : filter_on_qid_type,
"td" : filter_on_qid_type,
"np" : filter_on_qid_type,
"all" : ("04.qrels.web.mixed.txt", [
"https://trec.nist.gov/data/web/04.qrels.web.mixed.txt",
irds_mirror("93daa0e4b4190c84e30d2cce78a0f674"),
"https://www.dcs.gla.ac.uk/~craigm/04.qrels.web.mixed.txt"])
},
"info_url" : "https://trec.nist.gov/data/t13.web.html",
}
FIFTY_PCT_INDEX_BASE = "http://www.dcs.gla.ac.uk/~craigm/IR_HM/"
FIFTY_PCT_FILES = {
"index": {
"ex2" : [(filename, FIFTY_PCT_INDEX_BASE + "index/" + filename) for filename in ["data.meta-0.fsomapfile"] + STANDARD_TERRIER_INDEX_FILES],
"ex3" : [(filename, FIFTY_PCT_INDEX_BASE + "ex3/" + filename) for filename in ["data.meta-0.fsomapfile", "data-pagerank.oos"] + STANDARD_TERRIER_INDEX_FILES],
},
"topics": {
"training" : ("training.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "training.topics", "trec"),
"validation" : ("validation.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.topics", "trec"),
},
"qrels": {
"training" : ("training.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "training.qrels", "trec"),
"validation" : ("validation.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.qrels", "trec"),
}
}
# a function for the TREC Web track 2009 qrels, to make prels into qrels
def prel2qrel(self, component, variant):
prel_file, _ = self._get_one_file("prels", variant)
df = pd.read_csv(prel_file, sep=" ", names=["qid", "docno", "label", "oth1", "oth2"])[["qid", "docno", "label"]]
df["qid"] = df["qid"].astype(str)
df["docno"] = df["docno"].astype(str)
return (df, "direct")
TREC_WT_2009_FILES = {
"topics" : [
remove_prefix
],
"topics_prefixed" : [
("wt09.topics.queries-only", "https://trec.nist.gov/data/web/09/wt09.topics.queries-only", "singleline")
],
"qrels" : {
"adhoc" : prel2qrel,
"adhoc.catA" : prel2qrel,
"adhoc.catB" : prel2qrel,
},
"prels" : {
"adhoc" : ("prels.1-50.gz", "https://trec.nist.gov/data/web/09/prels.1-50.gz"),
"adhoc.catA" : ("prels.catA.1-50.gz", "https://trec.nist.gov/data/web/09/prels.catA.1-50.gz"),
"adhoc.catB" : ("prels.catB.1-50.gz", "https://trec.nist.gov/data/web/09/prels.catB.1-50.gz")
},
"info_url" : "https://trec.nist.gov/data/web09.html",
}
TREC_WT_2010_FILES = {
"topics" : [
("wt2010-topics.queries-only", "https://trec.nist.gov/data/web/10/wt2010-topics.queries-only", "singleline")
],
"qrels" :
{
"adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/10/10.adhoc-qrels.final")
},
"info_url" : "https://trec.nist.gov/data/web10.html",
}
TREC_WT_2011_FILES = {
"topics" : [
("queries.101-150.txt", "https://trec.nist.gov/data/web/11/queries.101-150.txt", "singleline")
],
"qrels" :
{
"adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/11/qrels.adhoc")
},
"info_url" : "https://trec.nist.gov/data/web2011.html",
}
TREC_WT_2012_FILES = {
"topics" : [
("queries.151-200.txt", "https://trec.nist.gov/data/web/12/queries.151-200.txt", "singleline")
],
"qrels" :
{
"adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/12/qrels.adhoc")
},
"info_url" : "https://trec.nist.gov/data/web2012.html",
}
TREC_WT2G_FILES = {
"qrels" : [ ("qrels.trec8.small_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec8.small_web.gz") ],
"topics" : [ ( "topics.401-450.gz", "https://trec.nist.gov/data/topics_eng/topics.401-450.gz" ) ],
"info_url" : "https://trec.nist.gov/data/t8.web.html",
}
TREC_WT10G_FILES = {
"qrels" : {
"trec9" : ("qrels.trec9.main_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec9.main_web.gz"),
"trec10-adhoc" : ("qrels.trec10.main_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec10.main_web.gz"),
"trec10-hp" : ("qrels.trec10.entrypage.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec10.entrypage.gz"),
},
"topics" : {
"trec9" : ( "topics.451-500.gz", "https://trec.nist.gov/data/topics_eng/topics.451-500.gz" ),
"trec10-adhoc" : ( "topics.501-550.txt", "https://trec.nist.gov/data/topics_eng/topics.501-550.txt" ),
"trec10-hp" : parse_desc_only
},
"topics_desc_only" : {
"trec10-hp" : ( "entry_page_topics.1-145.txt", "https://trec.nist.gov/data/topics_eng/entry_page_topics.1-145.txt" ),
},
"info_url" : "https://trec.nist.gov/data/t9.web.html",
}
def _merge_years(self, component, variant):
MAP_METHOD = {
"topics" : RemoteDataset.get_topics,
"qrels" : RemoteDataset.get_qrels,
}
dfs = []
low, hi = variant.split("-")
for y in range(int(low), int(hi)+1):
df = MAP_METHOD[component](self, variant=str(y))
dfs.append(df)
return (pd.concat(dfs), "direct")
TREC_TB_FILES = {
    "topics" : {
        "2004" : ( "04topics.701-750.txt", "https://trec.nist.gov/data/terabyte/04/04topics.701-750.txt" ),
        "2005" : ( "05.topics.751-800.txt", "https://trec.nist.gov/data/terabyte/05/05.topics.751-800.txt" ),
"2006" : ( "06.topics.801-850.txt", "https://trec.nist.gov/data/terabyte/06/06.topics.801-850.txt" ),
"2004-2006" : ("06.topics.701-850.txt", "https://trec.nist.gov/data/terabyte/06/06.topics.701-850.txt"),
"2006-np" : ( "06.np_topics.901-1081.txt", "https://trec.nist.gov/data/terabyte/06/06.np_topics.901-1081.txt" ),
"2005-np" : ( "05.np_topics.601-872.final.txt", "https://trec.nist.gov/data/terabyte/05/05.np_topics.601-872.final.txt")
},
"qrels" : {
"2004" : ( "04.qrels.12-Nov-04", "https://trec.nist.gov/data/terabyte/04/04.qrels.12-Nov-04"),
"2005" : ( "05.adhoc_qrels", "https://trec.nist.gov/data/terabyte/05/05.adhoc_qrels"),
"2006" : ( "qrels.tb06.top50", "https://trec.nist.gov/data/terabyte/06/qrels.tb06.top50"),
"2004-2006" : _merge_years,
"2005-np" : ( "05.np_qrels", "https://trec.nist.gov/data/terabyte/05/05.np_qrels"),
"2006-np" : ( "qrels.tb06.np", "https://trec.nist.gov/data/terabyte/06/qrels.tb06.np"),
},
"info_url" : "https://trec.nist.gov/data/terabyte.html"
}
TREC_ROBUST_04_FILES = {
"qrels" : [ ("qrels.robust2004.txt", "https://trec.nist.gov/data/robust/qrels.robust2004.txt") ],
"topics" : [ ( "04.testset.gz", "https://trec.nist.gov/data/robust/04.testset.gz" ) ],
"info_url" : "https://trec.nist.gov/data/t13_robust.html",
}
TREC_ROBUST_05_FILES = {
"qrels" : [ ("TREC2005.qrels.txt", "https://trec.nist.gov/data/robust/05/TREC2005.qrels.txt") ],
"topics" : [ ( "05.50.topics.txt", "https://trec.nist.gov/data/robust/05/05.50.topics.txt" ) ],
"info_url" : "https://trec.nist.gov/data/t14_robust.html",
}
TREC_PRECISION_MEDICINE_FILES = {
"topics" : {
"2017" : ("topics2017.xml", "http://www.trec-cds.org/topics2017.xml", "trecxml"),
"2018" : ("topics2018.xml", "http://www.trec-cds.org/topics2018.xml", "trecxml"),
"2019" : ("topics2019.xml", "http://www.trec-cds.org/topics2019.xml", "trecxml"),
"2020" : ("topics2020.xml", "http://www.trec-cds.org/topics2020.xml", "trecxml")
},
"qrels" : {
"qrels-2017-abstracts" : ("qrels-2017-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-final-abstracts.txt"), #TODO keep original names?
"qrels-2017-abstracts-sample" : ("qrels-2017-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/sample-qrels-final-abstracts.txt"),
"qrels-2017-trials" : ("qrels-2017-trials.txt", "https://trec.nist.gov/data/precmed/qrels-final-trials.txt"),
"qrels-2018-abstracts" : ("qrels-2018-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-abstracts-2018-v2.txt"),
"qrels-2018-abstracts-sample" : ("qrels-2018-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sample-abstracts-v2.txt"),
"qrels-2018-trials" : ("qrels-2018-trials.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-clinical_trials-2018-v2.txt"),
"qrels-2018-trials-sample" : ("qrels-2018-trials-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sample-trials-v2.txt"),
"qrels-2019-abstracts" : ("qrels-2019-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-abstracts.2019.txt"),
"qrels-2019-trials" : ("qrels-2019-trials.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-trials.38.txt"),
"qrels-2019-abstracts-sample" : ("qrels-2019-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sampleval-abstracts.2019.txt"),
"qrels-2019-trials-sample" : ("qrels-2019-trials-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sampleval-trials.38.txt")
},
"info_url" : "https://trec.nist.gov/data/precmed.html",
}
VASWANI_CORPUS_BASE = "https://raw.githubusercontent.com/terrier-org/pyterrier/master/tests/fixtures/vaswani_npl/"
VASWANI_INDEX_BASE = "https://raw.githubusercontent.com/terrier-org/pyterrier/master/tests/fixtures/index/"
VASWANI_FILES = {
"corpus": [("doc-text.trec", [
VASWANI_CORPUS_BASE + "corpus/doc-text.trec",
irds_mirror("a059e713c50350e39999467c8c73b7c5")])],
"topics": [("query-text.trec", [
VASWANI_CORPUS_BASE + "query-text.trec",
irds_mirror("3a624be2b0ef7c9534cf848891679bec")])],
"qrels": [("qrels", [
VASWANI_CORPUS_BASE + "qrels",
irds_mirror("6acb6db9969da8b8c6c23c09551af8d9")])],
"index": _datarepo_index_default_none,
#"index":
# [(filename, VASWANI_INDEX_BASE + filename) for filename in STANDARD_TERRIER_INDEX_FILES + ["data.meta-0.fsomapfile"]],
"info_url" : "http://ir.dcs.gla.ac.uk/resources/test_collections/npl/",
"corpus_iter" : lambda dataset, **kwargs : pt.index.treccollection2textgen(dataset.get_corpus(), num_docs=11429, verbose=kwargs.get("verbose", False))
}
DATASET_MAP : Dict[str, Dataset] = {
# used for UGlasgow teaching
"50pct" : RemoteDataset("50pct", FIFTY_PCT_FILES),
# umass antique corpus - see http://ciir.cs.umass.edu/downloads/Antique/
"antique" : RemoteDataset("antique", ANTIQUE_FILES),
# generated from http://ir.dcs.gla.ac.uk/resources/test_collections/npl/
"vaswani": RemoteDataset("vaswani", VASWANI_FILES),
"msmarco_document" : RemoteDataset("msmarco_document", MSMARCO_DOC_FILES),
"msmarcov2_document" : RemoteDataset("msmarcov2_document", MSMARCOv2_DOC_FILES),
"msmarco_passage" : RemoteDataset("msmarco_passage", MSMARCO_PASSAGE_FILES),
"msmarcov2_passage" : RemoteDataset("msmarcov2_passage", MSMARCOv2_PASSAGE_FILES),
"trec-robust-2004" : RemoteDataset("trec-robust-2004", TREC_ROBUST_04_FILES),
"trec-robust-2005" : RemoteDataset("trec-robust-2005", TREC_ROBUST_05_FILES),
"trec-terabyte" : RemoteDataset("trec-terabyte", TREC_TB_FILES),
    # medical-like tracks
    "trec-precision-medicine" : RemoteDataset("trec-precision-medicine", TREC_PRECISION_MEDICINE_FILES),
"trec-covid" : RemoteDataset("trec-covid", TREC_COVID_FILES),
#wt2g
"trec-wt2g" : RemoteDataset("trec-wt2g", TREC_WT2G_FILES),
#wt10g
"trec-wt10g" : RemoteDataset("trec-wt10g", TREC_WT10G_FILES),
    # .gov
    "trec-wt-2002" : RemoteDataset("trec-wt-2002", TREC_WT_2002_FILES),
    "trec-wt-2003" : RemoteDataset("trec-wt-2003", TREC_WT_2003_FILES),
"trec-wt-2004" : RemoteDataset("trec-wt-2004", TREC_WT_2004_FILES),
#clueweb09
"trec-wt-2009" : RemoteDataset("trec-wt-2009", TREC_WT_2009_FILES),
"trec-wt-2010" : RemoteDataset("trec-wt-2010", TREC_WT_2010_FILES),
"trec-wt-2011" : RemoteDataset("trec-wt-2011", TREC_WT_2011_FILES),
"trec-wt-2012" : RemoteDataset("trec-wt-2012", TREC_WT_2012_FILES),
}
# Include all datasets from ir_datasets with "irds:" prefix so they don't conflict with pt dataset names
# Results in records like:
# irds:antique
# irds:antique/test
# irds:antique/test/non-offensive
# irds:antique/train
# ...
for ds_id in ir_datasets.registry:
DATASET_MAP[f'irds:{ds_id}'] = IRDSDataset(ds_id, defer_load=True)
# "trec-deep-learning-docs"
#DATASET_MAP['msmarco_document'] = DATASET_MAP["trec-deep-learning-docs"]
#DATASET_MAP['msmarco_passage'] = DATASET_MAP["trec-deep-learning-passages"]
DATASET_MAP["trec-deep-learning-docs"] = DATASET_MAP['msmarco_document']
DATASET_MAP["trec-deep-learning-passages"] = DATASET_MAP['msmarco_passage']
def get_dataset(name, **kwargs):
"""
Get a dataset by name
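
    Example::

        dataset = pt.get_dataset("vaswani")
        topics = dataset.get_topics()
        qrels = dataset.get_qrels()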
"""
# Some datasets in ir_datasets are built on-the-fly (e.g., clirmatrix).
# Handle this by allocating it on demand here.
if name not in DATASET_MAP and name.startswith('irds:'):
# remove irds: prefix
ds_id = name[len('irds:'):]
DATASET_MAP[name] = IRDSDataset(ds_id)
rtr = DATASET_MAP[name]
rtr._configure(**kwargs)
return rtr
def datasets():
"""
Lists all the names of the datasets
"""
return DATASET_MAP.keys()
def find_datasets(query, en_only=True):
"""
    A grep-like method to help identify datasets. Filters the output of list_datasets() to datasets whose name contains the query string.
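
    Example::

        pt.find_datasets("covid")  # matches trec-covid, irds:cord19/trec-covid, etc.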
"""
datasets = list_datasets(en_only=en_only)
return datasets[datasets['dataset'].str.contains(query)]
def list_datasets(en_only=True):
"""
Returns a dataframe of all datasets, listing which topics, qrels, corpus files or indices are available.
By default, filters to only datasets with both a corpus and topics in English.
"""
    # suppress any IRDS warnings about deprecated datasets
restore_env = os.environ.get("IR_DATASETS_SKIP_DEPRECATED_WARNING", None)
try:
os.environ['IR_DATASETS_SKIP_DEPRECATED_WARNING'] = 'true'
rows=[]
for k in datasets():
dataset = get_dataset(k)
rows.append([
k,
dataset._describe_component("topics"),
dataset.get_topics_lang(),
dataset._describe_component("qrels"),
dataset._describe_component("corpus"),
dataset.get_corpus_lang(),
dataset._describe_component("index"),
dataset.info_url() ])
finally:
if restore_env is None:
del os.environ['IR_DATASETS_SKIP_DEPRECATED_WARNING']
else:
os.environ['IR_DATASETS_SKIP_DEPRECATED_WARNING'] = restore_env
result = pd.DataFrame(rows, columns=["dataset", "topics", "topics_lang", "qrels", "corpus", "corpus_lang", "index", "info_url"])
if en_only:
topics_filter = (result['topics'].isnull()) | (result['topics_lang'] == 'en')
corpus_filter = (result['corpus'].isnull()) | (result['corpus_lang'] == 'en')
result = result[topics_filter & corpus_filter]
return result
def transformer_from_dataset(
dataset : Union[str, Dataset],
clz,
variant: Optional[str] = None,
version: str = 'latest',
**kwargs) -> pt.Transformer:
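    # A minimal usage sketch (assumes the named index variant is published on data.terrier.org):
    #   bm25 = transformer_from_dataset("vaswani", pt.terrier.Retriever, variant="terrier_stemmed", wmodel="BM25")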
"""Returns a Transformer instance of type ``clz`` for the provided index of variant ``variant``."""
if isinstance(dataset, str):
dataset = get_dataset(dataset)
if version != "latest":
raise ValueError("index versioning not yet supported")
if hasattr(dataset, 'get_index'):
indexref = dataset.get_index(variant)
else:
        raise ValueError('dataset does not support get_index()')
classname = clz.__name__
classnames = [classname]
if classname == 'Retriever':
# we need to look for BatchRetrieve.args.json for legacy support
classnames.append('BatchRetrieve')
for c in classnames:
# now look for, e.g., BatchRetrieve.args.json file, which will define the args for Retriever, e.g. stemming
indexdir = indexref #os.path.dirname(indexref.toString())
        argsfile = os.path.join(indexdir, c + ".args.json")
if os.path.exists(argsfile):
with pt.io.autoopen(argsfile, "rt") as f:
args = json.load(f)
# anything specified in kwargs of this methods overrides the .args.json file
args.update(kwargs)
kwargs = args
return clz(indexref, **kwargs)