import os
import pandas as pd
from .transformer import is_lambda
import types
from typing import Union, Tuple, Iterator, Dict, Any, List
from warnings import warn
import requests
from .io import autoopen, touch
from . import tqdm, HOME_DIR
import tarfile
import pyterrier
TERRIER_DATA_BASE = "http://data.terrier.org/indices/"
STANDARD_TERRIER_INDEX_FILES = [
"data.direct.bf",
"data.document.fsarrayfile",
"data.inverted.bf",
"data.lexicon.fsomapfile",
"data.lexicon.fsomaphash",
"data.lexicon.fsomapid",
"data.meta.idx",
"data.meta.zdata",
"data.properties"
]
class GeneratorLen(object):
def __init__(self, gen, length):
self.gen = gen
self.length = length
def __len__(self):
return self.length
def __iter__(self):
return self.gen
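# A minimal illustrative sketch (not part of the API): tqdm can only display a
# total for iterables that support len(), so GeneratorLen wraps a generator of
# known length to give an accurate progress bar:
#
#   gen = GeneratorLen((x for x in range(3)), 3)
#   len(gen)    # 3, so tqdm(gen) can show a total
#   list(gen)   # [0, 1, 2]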
class Dataset():
"""
Represents a dataset (test collection) for indexing or retrieval. A common use-case is to use the Dataset within an Experiment::
dataset = pt.get_dataset("trec-robust-2004")
pt.Experiment([br1, br2], dataset.get_topics(), dataset.get_qrels(), eval_metrics=["map", "recip_rank"])
"""
def _configure(self, **kwargs):
pass
    def get_corpus(self):
"""
        Returns the location of the files to allow indexing the corpus, i.e. a list of filenames.
"""
pass
    def get_corpus_iter(self, verbose=True) -> Iterator[Dict[str,Any]]:
"""
        Returns an iterator of dicts for this collection. If verbose=True, a tqdm progress bar is displayed as the iterator is consumed.
"""
pass
    def get_corpus_lang(self) -> Union[str,None]:
"""
Returns the ISO 639-1 language code for the corpus, or None for multiple/other/unknown
"""
return None
    def get_index(self, variant=None, **kwargs):
"""
        Returns the IndexRef of the index to allow retrieval. Only a few datasets provide ready-made indices.
"""
pass
    def get_topics(self, variant=None) -> pd.DataFrame:
"""
Returns the topics, as a dataframe, ready for retrieval.
"""
pass
    def get_topics_lang(self) -> Union[str,None]:
"""
Returns the ISO 639-1 language code for the topics, or None for multiple/other/unknown
"""
return None
    def get_qrels(self, variant=None) -> pd.DataFrame:
"""
Returns the qrels, as a dataframe, ready for evaluation.
"""
pass
    def get_topicsqrels(self, variant=None) -> Tuple[pd.DataFrame,pd.DataFrame]:
"""
Returns both the topics and qrels in a tuple. This is useful for pt.Experiment().
"""
return (
self.get_topics(variant=variant),
self.get_qrels(variant=variant)
)
    def info_url(self):
"""
        Returns a URL that provides more information about this dataset.
"""
return None
    def get_results(self, variant=None) -> pd.DataFrame:
"""
Returns a standard result set provided by the dataset. This is useful for re-ranking experiments.
"""
pass
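# Illustrative usage sketch (assumes a working PyTerrier setup; the index path is
# hypothetical, and older PyTerrier versions also require pt.init() first): a
# Dataset's corpus iterator feeds an indexer, and its topics/qrels feed pt.Experiment:
#
#   import pyterrier as pt
#   dataset = pt.get_dataset("vaswani")
#   indexref = pt.IterDictIndexer("./vaswani_index").index(dataset.get_corpus_iter())
#   pt.Experiment([pt.BatchRetrieve(indexref)], *dataset.get_topicsqrels(), eval_metrics=["map"])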
class RemoteDataset(Dataset):
def __init__(self, name, locations):
self.locations = locations
self.name = name
self.user = None
self.password = None
    def _configure(self, **kwargs):
        pt_home = HOME_DIR
        if pt_home is None:
            from os.path import expanduser
            userhome = expanduser("~")
            pt_home = os.path.join(userhome, ".pyterrier")
self.corpus_home = os.path.join(pt_home, "corpora", self.name)
        if 'user' in kwargs:
            self.user = kwargs['user']
            self.password = kwargs.get('password')
@staticmethod
def download(URLs : Union[str,List[str]], filename : str, **kwargs):
import pyterrier as pt
basename = os.path.basename(filename)
if isinstance(URLs, str):
URLs = [URLs]
finalattempt=len(URLs)-1
error = None
for i, url in enumerate(URLs):
try:
r = requests.get(url, allow_redirects=True, stream=True, **kwargs)
r.raise_for_status()
total = int(r.headers.get('content-length', 0))
with pt.io.finalized_open(filename, 'b') as file, tqdm(
desc=basename,
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in r.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
break
except Exception as e:
if error is not None:
                    e.__cause__ = error # chain the errors, so that all attempts are reported if the final one fails
error = e
if i == finalattempt:
raise error
else:
warn("Problem fetching %s, resorting to next mirror" % url)
    def _check_variant(self, component, variant=None):
        name = self.name
        if component not in self.locations:
            raise ValueError("No %s in dataset %s" % (component, name))
        if variant is None:
            if not isinstance(self.locations[component], list):
                raise ValueError("For %s in dataset %s, you must specify a variant. Available are: %s" % (component, name, str(list(self.locations[component].keys()))))
        else:
            if isinstance(self.locations[component], list):
                raise ValueError("For %s in dataset %s, there are no variants, but you specified %s" % (component, name, variant))
            if variant not in self.locations[component]:
                raise ValueError("For %s in dataset %s, there is no variant %s. Available are: %s" % (component, name, variant, str(list(self.locations[component].keys()))))
def _get_one_file(self, component, variant=None):
filetype=None
name=self.name
self._check_variant(component, variant)
location = self.locations[component][0] if variant is None else self.locations[component][variant]
if is_lambda(location) or isinstance(location, types.FunctionType):
argcount = location.__code__.co_argcount
if argcount == 0:
return location()
elif argcount == 3:
return location(self, component, variant)
else:
raise TypeError("Expected function with 0 or 3 arguments for %s %s %s" % (component, name, variant))
local = location[0]
URL = location[1]
if len(location) > 2:
filetype = location[2]
if not os.path.exists(self.corpus_home):
os.makedirs(self.corpus_home)
local = os.path.join(self.corpus_home, local)
actualURL = URL if isinstance(URL, str) else URL[0]
if "#" in actualURL and not os.path.exists(local):
tarname, intarfile = actualURL.split("#")
assert not "/" in intarfile
assert ".tar" in tarname or ".tgz" in tarname
localtarfile, _ = self._get_one_file("tars", tarname)
tarobj = tarfile.open(localtarfile, "r")
tarobj.extract(intarfile, path=self.corpus_home)
os.rename(os.path.join(self.corpus_home, intarfile), local)
return (local, filetype)
if not os.path.exists(local):
try:
print("Downloading %s %s to %s" % (self.name, component, local))
kwargs = {}
if self.user is not None:
kwargs["auth"]=(self.user, self.password)
RemoteDataset.download(URL, local, **kwargs)
            except requests.exceptions.HTTPError as he: # download() uses requests, not urllib
                raise ValueError("Could not fetch " + URL) from he
return (local, filetype)
def _get_all_files(self, component, variant=None, **kwargs):
if variant is None:
localDir = os.path.join(self.corpus_home, component)
else:
localDir = os.path.join(self.corpus_home, component, variant)
        # preserve any caller-supplied kwargs (e.g. version, forwarded to location functions),
        # adding HTTP auth if credentials were configured
        if self.user is not None:
            kwargs["auth"] = (self.user, self.password)
direxists = os.path.exists(localDir)
location = self.locations[component]
if is_lambda(location) or isinstance(location, types.FunctionType):
            # functions are expensive to call, as normally another HTTP request is needed.
            # just assume we have everything we need if the local directory already exists
            # and contains a .complete file.
if direxists and os.path.exists(os.path.join(localDir, ".complete")):
return localDir
# call the function, and get the file list
file_list = location(self, component, variant, **kwargs)
else:
file_list = self.locations[component] if variant is None else self.locations[component][variant]
if not direxists:
os.makedirs(localDir)
print("Downloading %s %s to %s" % (self.name, component, localDir))
        # check how much space is required versus how much is available
        def _totalsize(file_list):
            # sum the known file sizes; return -1 if no sizes are known
            sizes = [f[2] for f in file_list if len(f) > 2]
            return sum(sizes) if sizes else -1
totalsize = _totalsize(file_list)
if totalsize > 0:
import shutil
total, used, free = shutil.disk_usage(localDir)
if free < totalsize:
raise ValueError("Insufficient freedisk space at %s to download index" % localDir)
if totalsize > 2 * 2**30:
warn("Downloading index of > 2GB.")
for fileentry in file_list:
local = fileentry[0]
URL = fileentry[1]
expectedlength = -1
if len(fileentry) == 3:
expectedlength = fileentry[2]
local = os.path.join(localDir, local)
            # if the file exists and we know its expected length, check that the download completed
fileexists = os.path.exists(local)
if fileexists and expectedlength >= 0:
length = os.stat(local).st_size
if expectedlength != length:
warn("Removing partial download of %s (expected %d bytes, found %d)" % (local, expectedlength, length ))
os.remove(local)
fileexists = False
            if not fileexists:
                if "#" in URL:
                    tarname, intarfile = URL.split("#")
                    assert "/" not in intarfile
                    assert ".tar" in tarname or ".tgz" in tarname
                    localtarfile, _ = self._get_one_file("tars", tarname)
                    with tarfile.open(localtarfile, "r") as tarobj:
                        tarobj.extract(intarfile, path=self.corpus_home)
                    local = os.path.join(self.corpus_home, local)
                    #TODO, files could be recompressed here to save space
                    os.rename(os.path.join(self.corpus_home, intarfile), local)
else:
try:
RemoteDataset.download(URL, local, **kwargs)
                    except requests.exceptions.HTTPError as he: # download() uses requests, not urllib
                        raise ValueError("Could not fetch " + URL) from he
            # verify the length of the downloaded file, if known
if expectedlength >= 0:
length = os.stat(local).st_size
if expectedlength != length:
raise ValueError("Failed download of %s to %s (expected %d bytes, found %d)" % (URL, local, expectedlength, length ))
# finally, touch a file signifying that download has been completed
touch(os.path.join(localDir, ".complete"))
return localDir
def _describe_component(self, component):
if component not in self.locations:
return None
        if isinstance(self.locations[component], list):
return True
if isinstance(self.locations[component], dict):
return list(self.locations[component].keys())
return True
def get_corpus(self, **kwargs):
import pyterrier as pt
return list(filter(lambda f : not f.endswith(".complete"), pt.io.find_files(self._get_all_files("corpus", **kwargs))))
def get_corpus_iter(self, **kwargs):
if not "corpus_iter" in self.locations:
raise ValueError("Cannot supply a corpus iterator on dataset %s" % self.name)
return self.locations["corpus_iter"](self, **kwargs)
def get_corpus_lang(self):
if 'corpus' in self.locations:
return 'en' # all are english
return None
    def get_qrels(self, variant=None):
        import pyterrier as pt
        filename, filetype = self._get_one_file("qrels", variant)
        if filetype == "direct":
            return filename
        return pt.io.read_qrels(filename)
def get_topics(self, variant=None, **kwargs):
import pyterrier as pt
file, filetype = self._get_one_file("topics", variant)
if filetype is None or filetype in pt.io.SUPPORTED_TOPICS_FORMATS:
return pt.io.read_topics(file, format=filetype, **kwargs)
elif filetype == "direct":
return file
raise ValueError("Unknown filetype %s for %s topics %s" % (filetype, self.name, variant))
def get_topics_lang(self):
if 'topics' in self.locations:
return 'en' # all are english
return None
def get_index(self, variant=None, **kwargs):
import pyterrier as pt
if self.name == "50pct" and variant is None:
variant="ex1"
thedir = self._get_all_files("index", variant=variant, **kwargs)
return thedir
#return pt.autoclass("org.terrier.querying.IndexRef").of(os.path.join(thedir, "data.properties"))
def __repr__(self):
return "RemoteDataset for %s, with %s" % (self.name, str(list(self.locations.keys())))
def info_url(self):
return self.locations['info_url'] if "info_url" in self.locations else None
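# Illustrative sketch of the locations dict that RemoteDataset consumes. A component
# maps to a list (no variants) or a dict of variants; each entry is
# (local_filename, URL_or_list_of_mirrors[, filetype]), and a callable may be given
# instead, to compute (data, "direct") on demand:
#
#   EXAMPLE_FILES = {  # hypothetical dataset definition
#       "topics" : [("topics.txt", "https://example.org/topics.txt", "singleline")],
#       "qrels" : { "test" : ("test.qrels", "https://example.org/test.qrels") },
#       "info_url" : "https://example.org/",
#   }
#   # DATASET_MAP["example"] = RemoteDataset("example", EXAMPLE_FILES)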
class IRDSDataset(Dataset):
def __init__(self, irds_id, defer_load=False):
self._irds_id = irds_id
self._irds_ref = None if defer_load else ir_datasets.load(self._irds_id)
def irds_ref(self):
if self._irds_ref is None:
self._irds_ref = ir_datasets.load(self._irds_id)
return self._irds_ref
def get_corpus(self):
raise NotImplementedError("IRDSDataset doesn't support get_corpus; use get_corpus_iter instead. If you "
"are indexing, get_corpus_iter should be used in conjunction with IterDictIndexer.")
def get_corpus_iter(self, verbose=True, start=0, count=None):
ds = self.irds_ref()
assert ds.has_docs(), f"{self._irds_id} doesn't support get_corpus_iter"
it = ds.docs_iter()
total = ds.docs_count()
# use slicing if requested
if start > 0 or count is not None:
if count is not None:
it = it[start:start+count]
total = count
else:
it = it[start:]
total -= start
# tqdm support
if verbose:
it = tqdm(it, desc=f'{self._irds_id} documents', total=total)
# rewrite to follow pyterrier std
def gen():
for doc in it:
doc = doc._asdict()
# pyterrier uses "docno"
doc['docno'] = doc.pop('doc_id')
yield doc
# ensure we can provide accurate len
return GeneratorLen(gen(), total)
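    # Illustrative sketch: the slicing support above allows a sample of a large
    # corpus to be processed without iterating over every document:
    #
    #   ds = IRDSDataset("antique")                     # any ir_datasets id
    #   it = ds.get_corpus_iter(start=0, count=1000)    # first 1000 docs only
    #   # each yielded dict has "docno" plus the dataset's remaining doc fields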
def get_corpus_lang(self):
ds = self.irds_ref()
if ds.has_docs():
return ds.docs_lang()
return None
def get_index(self, variant=None):
# this is only for indices where Terrier provides an index already
raise NotImplementedError("IRDSDataset doesn't support get_index")
def get_topics(self, variant=None, tokenise_query=True):
"""
Returns the topics, as a dataframe, ready for retrieval.
"""
ds = self.irds_ref()
assert ds.has_queries(), f"{self._irds_id} doesn't support get_topics"
qcls = ds.queries_cls()
assert variant is None or variant in qcls._fields[1:], f"{self._irds_id} only supports the following topic variants {qcls._fields[1:]}"
df = pd.DataFrame(ds.queries_iter())
df.rename(columns={"query_id": "qid"}, inplace=True) # pyterrier uses "qid"
if variant is not None:
# Some datasets have a query field called "query". We need to remove it or
# we'll end up with multiple "query" columns, which will cause problems
# because many components are written assuming no columns have the same name.
if variant != 'query' and 'query' in df.columns:
df.drop(columns=['query'], axis=1, inplace=True)
df.rename(columns={variant: "query"}, inplace=True) # user specified which version of the query they want
df.drop(columns=df.columns.difference(['qid','query']), axis=1, inplace=True)
elif len(qcls._fields) == 2:
# auto-rename single query field to "query" if there's only query_id and that field
df.rename(columns={qcls._fields[1]: "query"}, inplace=True)
else:
print(f'There are multiple query fields available: {qcls._fields[1:]}. To use with pyterrier, provide variant or modify dataframe to add query column.')
# apply pyterrier tokenisation (otherwise the queries may not play well with batchretrieve)
if tokenise_query and 'query' in df:
import pyterrier as pt
tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
def pt_tokenise(text):
return ' '.join(tokeniser.getTokens(text))
df['query'] = df['query'].apply(pt_tokenise)
return df
def get_topics_lang(self):
ds = self.irds_ref()
if ds.has_queries():
return ds.queries_lang()
return None
def get_qrels(self, variant=None):
"""
Returns the qrels, as a dataframe, ready for evaluation.
"""
ds = self.irds_ref()
assert ds.has_qrels(), f"{self._irds_id} doesn't support get_qrels"
qrelcls = ds.qrels_cls()
qrel_fields = [f for f in qrelcls._fields if f not in ('query_id', 'doc_id', 'iteration')]
assert variant is None or variant in qrel_fields, f"{self._irds_id} only supports the following qrel variants {qrel_fields}"
df = pd.DataFrame(ds.qrels_iter())
# pyterrier uses "qid" and "docno"
df.rename(columns={
"query_id": "qid",
"doc_id": "docno"}, inplace=True)
# pyterrier uses "label"
        if variant is not None:
            df.rename(columns={variant: "label"}, inplace=True)
        elif len(qrel_fields) == 1:
# usually "relevance"
df.rename(columns={qrel_fields[0]: "label"}, inplace=True)
elif 'relevance' in qrel_fields:
print(f'There are multiple qrel fields available: {qrel_fields}. Defaulting to "relevance", but to use a different one, supply variant')
df.rename(columns={'relevance': "label"}, inplace=True)
else:
            print(f'There are multiple qrel fields available: {qrel_fields}. To use with pyterrier, provide variant or modify dataframe to add a label column.')
return df
def get_results(self, variant=None) -> pd.DataFrame:
"""
Returns a standard result set provided by the dataset. This is useful for re-ranking experiments.
"""
ds = self.irds_ref()
        assert ds.has_scoreddocs(), f"{self._irds_id} doesn't support get_results"
        result = pd.DataFrame(ds.scoreddocs_iter())
result = result.rename(columns={'query_id': 'qid', 'doc_id': 'docno'}) # convert irds field names to pyterrier names
        result.sort_values(by=['qid', 'score', 'docno'], ascending=[True, False, True], inplace=True) # ensure results are sorted by qid, descending score, then docno
# result doesn't yet contain queries (only qids) so load and merge them in
topics = self.get_topics(variant)
result = pd.merge(result, topics, how='left', on='qid', copy=False)
return result
def _describe_component(self, component):
ds = self.irds_ref()
if component == "topics":
if ds.has_queries():
fields = ds.queries_cls()._fields[1:]
if len(fields) > 1:
return list(fields)
return True
return None
if component == "qrels":
if ds.has_qrels():
fields = [f for f in ds.qrels_cls()._fields if f not in ('query_id', 'doc_id', 'iteration')]
if len(fields) > 1:
return list(fields)
return True
return None
if component == "corpus":
return ds.has_docs() or None
if component == "results":
return ds.has_scoreddocs() or None
return None
def info_url(self):
top_id = self._irds_id.split('/', 1)[0]
suffix = f'#{self._irds_id}' if top_id != self._irds_id else ''
return f'https://ir-datasets.com/{top_id}.html{suffix}'
def __repr__(self):
return f"IRDSDataset({repr(self._irds_id)})"
def passage_generate(dataset):
for filename in dataset.get_corpus():
with autoopen(filename, 'rt') as corpusfile:
for l in corpusfile: #for each line
docno, passage = l.split("\t")
yield {'docno' : docno, 'text' : passage}
def _datarepo_index(self, component, variant=None, version='latest', **kwargs):
if variant is None:
raise ValueError(f"Must specify index variant for {self.name}. See http://data.terrier.org/{self.name}.dataset.html")
urlprefix= f"http://data.terrier.org/indices/{self.name}/{variant}/{version}/"
url = urlprefix + "files"
try:
r = requests.get(url, **kwargs)
r.raise_for_status()
file = r.text.splitlines()
except Exception as e:
raise ValueError(f"Could not find index variant {variant} for dataset {self.name} at {url}. See available variants at http://data.terrier.org/{self.name}.dataset.html") from e
rtr = []
import re
for linenum, line in enumerate(file):
# skip comments
if line.startswith("#"):
continue
try:
(length, filename) = re.split(r"\s+", line.strip(), 2)
rtr.append((filename, urlprefix+filename, int(length)))
except Exception as e:
raise ValueError(f"Could not parse {url} line {linenum} '{line}'") from e
return rtr
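# Illustrative sketch of the "files" listing parsed above: each non-comment line
# carries a size in bytes, whitespace, then a filename (sizes hypothetical):
#
#   # comment lines are skipped
#   123456789 data.inverted.bf
#   23456789 data.lexicon.fsomapfile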
def _datarepo_index_default_none(self, component, variant=None, version='latest', **kwargs):
"""
For backward compatability with vaswani - use default for variant
"""
if variant is None:
variant = 'terrier_stemmed'
return _datarepo_index(self, component, variant=variant, version=version, **kwargs)
ANTIQUE_FILES = {
"topics" : {
"train" : ("antique-train-queries.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt", "singleline"),
"test" : ("antique-test-queries.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt", "singleline"),
},
"qrels" : {
"train" : ("antique-train.qrel", "http://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel", "singleline"),
"test" : ("antique-test.qrel", "http://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel", "singleline"),
},
"corpus" :
[("antique-collection.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt")],
"info_url" : "https://ciir.cs.umass.edu/downloads/Antique/readme.txt",
"corpus_iter" : passage_generate
}
TREC_COVID_FILES = {
"topics" : {
"round1" : ("topics-rnd1.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml", "trecxml"),
"round2" : ("topics-rnd2.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml", "trecxml"),
"round3" : ("topics-rnd3.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd3.xml", "trecxml"),
"round4" : ("topics-rnd4.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd4.xml", "trecxml"),
"round5" : ("topics-rnd5.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml", "trecxml"),
},
"qrels" : {
"round1" : ("qrels-rnd1.txt", "https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt"),
"round2" : ("qrels-rnd2.txt", "https://ir.nist.gov/covidSubmit/data/qrels-rnd2.txt"),
"round3" : ("qrels-rnd3.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j2.5-3.txt"),
"round3-cumulative" : ("qrels-rnd3-cumulative.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j0.5-3.txt"),
"round4" : ("qrels-rnd4.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j3.5-4.txt"),
"round4-cumulative" : ("qrels-rnd4-cumulative.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j0.5-4.txt"),
"round5" : ("qrels-covid_d5_j4.5-5.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j4.5-5.txt"),
},
"corpus" : {
"round4": ("round4.tar.gz", "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-06-19.tar.gz"),
"round5": ("round5.tar.gz", "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz"),
},
"docids" : {
"docids-rnd3" : ("docids-rnd3.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd3.txt"),
"docids-rnd4" : ("docids-rnd4.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd4.txt"),
"docids-rnd5" : ("docids-rnd5.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd5.txt")
},
"info_url" : "https://ir.nist.gov/covidSubmit/",
"index": _datarepo_index
}
def msmarco_document_generate(dataset):
for filename in dataset.get_corpus(variant="corpus-tsv"):
with autoopen(filename, 'rt') as corpusfile:
for l in corpusfile: #for each line
docno, url, title, passage = l.split("\t")
yield {'docno' : docno, 'url' : url, 'title' : title, 'text' : passage}
MSMARCO_DOC_FILES = {
"corpus" :
[("msmarco-docs.trec.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")],
"corpus-tsv":
[("msmarco-docs.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")],
"topics" :
{
"train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"),
"dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"),
"test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
"test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline")
},
"qrels" :
{
"train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"),
"dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"),
"test" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-docs.txt"),
"test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-docs.txt")
},
"info_url" : "https://microsoft.github.io/msmarco/",
"corpus_iter" : msmarco_document_generate,
"index" : _datarepo_index
}
MSMARCO_PASSAGE_FILES = {
"corpus" :
[("collection.tsv", "collection.tar.gz#collection.tsv")],
"index": {
"terrier_stemmed" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES],
"terrier_unstemmed" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_unstemmed/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES],
"terrier_stemmed_text" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed_text/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES],
"terrier_unstemmed_text" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_unstemmed_text/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES],
"terrier_stemmed_deepct" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed_deepct/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES],
"terrier_stemmed_docT5query" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed_docT5query/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES],
},
"topics" :
{
"train" : ("queries.train.tsv", "queries.tar.gz#queries.train.tsv", "singleline"),
"dev" : ("queries.dev.tsv", "queries.tar.gz#queries.dev.tsv", "singleline"),
"dev.small" : ("queries.dev.small.tsv", "collectionandqueries.tar.gz#queries.dev.small.tsv", "singleline"),
"eval" : ("queries.eval.tsv", "queries.tar.gz#queries.eval.tsv", "singleline"),
"eval.small" : ("queries.eval.small.tsv", "collectionandqueries.tar.gz#queries.eval.small.tsv", "singleline"),
"test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
"test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline")
},
"tars" : {
"queries.tar.gz" : ("queries.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz"),
"collection.tar.gz" : ("collection.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz"),
"collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz")
},
"qrels" :
{
"train" : ("qrels.train.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv"),
"dev" : ("qrels.dev.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv"),
"test-2019" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-pass.txt"),
"test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-pass.txt"),
"dev.small" : ("qrels.dev.small.tsv", "collectionandqueries.tar.gz#qrels.dev.small.tsv"),
},
"info_url" : "https://microsoft.github.io/MSMARCO-Passage-Ranking/",
"corpus_iter" : passage_generate,
"index" : _datarepo_index
}
MSMARCOv2_DOC_FILES = {
"info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
"topics" : {
"train" : ("docv2_train_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"),
"dev1" :("docv2_dev_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"),
"dev2" :("docv2_dev2_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"),
"valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
"valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
"trec_2021" : ("2021_queries.tsv" , "https://msmarco.blob.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
},
"qrels" : {
"train" : ("docv2_train_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"),
"dev1" :("docv2_dev_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"),
"dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"),
"valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"),
"valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz")
},
"index" : _datarepo_index,
}
MSMARCOv2_PASSAGE_FILES = {
"info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
"topics" : {
"train" : ("passv2_train_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"),
"dev1" : ("passv2_dev_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"),
"dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"),
"trec_2021" : ("2021_queries.tsv" , "https://msmarco.blob.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
},
"qrels" : {
"train" : ("passv2_train_qrels.tsv" "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"),
"dev1" : ("passv2_dev_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"),
"dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"),
},
"index" : _datarepo_index,
}
# remove the WT- prefix from topic qids
def remove_prefix(self, component, variant):
    import pyterrier as pt
    topics_file, filetype = self._get_one_file("topics_prefixed", variant)
    if filetype in pt.io.SUPPORTED_TOPICS_FORMATS:
        topics = pt.io.read_topics(topics_file, filetype)
    else:
        raise ValueError("Unknown topic type %s" % filetype)
    topics["qid"] = topics.apply(lambda row: row["qid"].split("-")[1], axis=1)
    return (topics, "direct")
# a function to fix the named page topics of the TREC Web tracks 2001 and 2002, parsing only the DESC field
def parse_desc_only(self, component, variant):
    import pyterrier as pt
    filename, _ = self._get_one_file("topics_desc_only", variant=variant)
    topics = pt.io.read_topics(filename, format="trec", whitelist=["DESC"], blacklist=None)
    topics["qid"] = topics.apply(lambda row: row["qid"].replace("NP", ""), axis=1)
    topics["qid"] = topics.apply(lambda row: row["qid"].replace("EP", ""), axis=1)
    return (topics, "direct")
TREC_WT_2002_FILES = {
"topics" :
{
"td" : ("webtopics_551-600.txt.gz", "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "trec"),
"np" : parse_desc_only
},
"topics_desc_only" : {
"np" : ("webnamed_page_topics.1-150.txt.gz", "https://trec.nist.gov/data/topics_eng/webnamed_page_topics.1-150.txt.gz", "trec")
},
"qrels" :
{
"np" : ("qrels.named-page.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz"),
"td" : ("qrels.distillation.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.distillation.txt.gz")
},
"info_url" : "https://trec.nist.gov/data/t11.web.html",
}
TREC_WT_2003_FILES = {
"topics" :
{
"np" : ("webtopics_551-600.txt.gz", "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "trec"),
"td" : ("2003.distillation_topics.1-50.txt", "https://trec.nist.gov/data/topics_eng/2003.distillation_topics.1-50.txt", "trec"),
},
"qrels" :
{
"np" : ("qrels.named-page.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz"),
"td" : ("qrels.distillation.2003.txt", "https://trec.nist.gov/data/qrels_eng/qrels.distillation.2003.txt")
},
"info_url" : "https://trec.nist.gov/data/t12.web.html",
}
def irds_mirror(md5):
return f'http://mirror.ir-datasets.com/{md5}'
def filter_on_qid_type(self, component, variant):
    if component == "topics":
        data = self.get_topics("all")
    elif component == "qrels":
        data = self.get_qrels("all")
    else:
        raise ValueError("Unknown component %s" % component)
    qid2type_file = self._get_one_file("topics_map")[0]
qid2type = pd.read_csv(qid2type_file, names=["qid", "type"], sep=" ")
qid2type["qid"] = qid2type.apply(lambda row: row["qid"].split("-")[1], axis=1)
rtr = data.merge(qid2type[qid2type["type"] == variant], on=["qid"])
if len(rtr) == 0:
raise ValueError("No such topic type '%s'" % variant)
rtr.drop(columns=['type'], inplace=True)
return (rtr, "direct")
TREC_WT_2004_FILES = {
"topics" :
{
"all" : remove_prefix,
"np": filter_on_qid_type,
"hp": filter_on_qid_type,
"td": filter_on_qid_type,
},
"topics_map" : [("04.topic-map.official.txt", [
"https://trec.nist.gov/data/web/04.topic-map.official.txt",
irds_mirror("79737768b3be1aa07b14691aa54802c5"),
"https://www.dcs.gla.ac.uk/~craigm/04.topic-map.official.txt"
] )],
"topics_prefixed" : {
"all" : ("Web2004.query.stream.trecformat.txt", [
"https://trec.nist.gov/data/web/Web2004.query.stream.trecformat.txt",
irds_mirror("10821f7a000b8bec058097ede39570be"),
"https://www.dcs.gla.ac.uk/~craigm/Web2004.query.stream.trecformat.txt"],
"trec")
},
"qrels" :
{
"hp" : filter_on_qid_type,
"td" : filter_on_qid_type,
"np" : filter_on_qid_type,
"all" : ("04.qrels.web.mixed.txt", [
"https://trec.nist.gov/data/web/04.qrels.web.mixed.txt",
irds_mirror("93daa0e4b4190c84e30d2cce78a0f674"),
"https://www.dcs.gla.ac.uk/~craigm/04.qrels.web.mixed.txt"])
},
"info_url" : "https://trec.nist.gov/data/t13.web.html",
}
FIFTY_PCT_INDEX_BASE = "http://www.dcs.gla.ac.uk/~craigm/IR_HM/"
FIFTY_PCT_FILES = {
"index": {
"ex2" : [(filename, FIFTY_PCT_INDEX_BASE + "index/" + filename) for filename in ["data.meta-0.fsomapfile"] + STANDARD_TERRIER_INDEX_FILES],
"ex3" : [(filename, FIFTY_PCT_INDEX_BASE + "ex3/" + filename) for filename in ["data.meta-0.fsomapfile", "data-pagerank.oos"] + STANDARD_TERRIER_INDEX_FILES],
},
"topics": {
"training" : ("training.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "training.topics", "trec"),
"validation" : ("validation.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.topics", "trec"),
},
"qrels": {
"training" : ("training.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "training.qrels", "trec"),
"validation" : ("validation.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.qrels", "trec"),
}
}
# a function for the TREC Web track 2009 qrels, to make prels into qrels
def prel2qrel(self, component, variant):
prel_file, _ = self._get_one_file("prels", variant)
df = pd.read_csv(prel_file, sep=" ", names=["qid", "docno", "label", "oth1", "oth2"])[["qid", "docno", "label"]]
df["qid"] = df["qid"].astype(str)
df["docno"] = df["docno"].astype(str)
return (df, "direct")
TREC_WT_2009_FILES = {
"topics" : [
remove_prefix
],
"topics_prefixed" : [
("wt09.topics.queries-only", "https://trec.nist.gov/data/web/09/wt09.topics.queries-only", "singleline")
],
"qrels" : {
"adhoc" : prel2qrel,
"adhoc.catA" : prel2qrel,
"adhoc.catB" : prel2qrel,
},
"prels" : {
"adhoc" : ("prels.1-50.gz", "https://trec.nist.gov/data/web/09/prels.1-50.gz"),
"adhoc.catA" : ("prels.catA.1-50.gz", "https://trec.nist.gov/data/web/09/prels.catA.1-50.gz"),
"adhoc.catB" : ("prels.catB.1-50.gz", "https://trec.nist.gov/data/web/09/prels.catB.1-50.gz")
},
"info_url" : "https://trec.nist.gov/data/web09.html",
}
TREC_WT_2010_FILES = {
"topics" : [
("wt2010-topics.queries-only", "https://trec.nist.gov/data/web/10/wt2010-topics.queries-only", "singleline")
],
"qrels" :
{
"adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/10/10.adhoc-qrels.final")
},
"info_url" : "https://trec.nist.gov/data/web10.html",
}
TREC_WT_2011_FILES = {
"topics" : [
("queries.101-150.txt", "https://trec.nist.gov/data/web/11/queries.101-150.txt", "singleline")
],
"qrels" :
{
"adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/11/qrels.adhoc")
},
"info_url" : "https://trec.nist.gov/data/web2011.html",
}
TREC_WT_2012_FILES = {
"topics" : [
("queries.151-200.txt", "https://trec.nist.gov/data/web/12/queries.151-200.txt", "singleline")
],
"qrels" :
{
"adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/12/qrels.adhoc")
},
"info_url" : "https://trec.nist.gov/data/web2012.html",
}
TREC_WT2G_FILES = {
"qrels" : [ ("qrels.trec8.small_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec8.small_web.gz") ],
"topics" : [ ( "topics.401-450.gz", "https://trec.nist.gov/data/topics_eng/topics.401-450.gz" ) ],
"info_url" : "https://trec.nist.gov/data/t8.web.html",
}
TREC_WT10G_FILES = {
"qrels" : {
"trec9" : ("qrels.trec9.main_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec9.main_web.gz"),
"trec10-adhoc" : ("qrels.trec10.main_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec10.main_web.gz"),
"trec10-hp" : ("qrels.trec10.entrypage.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec10.entrypage.gz"),
},
"topics" : {
"trec9" : ( "topics.451-500.gz", "https://trec.nist.gov/data/topics_eng/topics.451-500.gz" ),
"trec10-adhoc" : ( "topics.501-550.txt", "https://trec.nist.gov/data/topics_eng/topics.501-550.txt" ),
"trec10-hp" : parse_desc_only
},
"topics_desc_only" : {
"trec10-hp" : ( "entry_page_topics.1-145.txt", "https://trec.nist.gov/data/topics_eng/entry_page_topics.1-145.txt" ),
},
"info_url" : "https://trec.nist.gov/data/t9.web.html",
}
def _merge_years(self, component, variant):
MAP_METHOD = {
"topics" : RemoteDataset.get_topics,
"qrels" : RemoteDataset.get_qrels,
}
dfs = []
low, hi = variant.split("-")
for y in range(int(low), int(hi)+1):
df = MAP_METHOD[component](self, variant=str(y))
dfs.append(df)
return (pd.concat(dfs), "direct")
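# Illustrative sketch: _merge_years supports variants naming a year range, e.g.
#
#   pt.get_dataset("trec-terabyte").get_qrels("2004-2006")
#
# concatenates the "2004", "2005" and "2006" qrels into a single dataframe.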
TREC_TB_FILES = {
"topics" : {
"2004" : ( "04topics.701-750.txt", "https://trec.nist.gov/data/terabyte/04/04topics.701-750.txt" ),
"2005" : ( "04topics.701-750.txt", "https://trec.nist.gov/data/terabyte/05/05.topics.751-800.txt" ),
"2006" : ( "06.topics.801-850.txt", "https://trec.nist.gov/data/terabyte/06/06.topics.801-850.txt" ),
"2004-2006" : ("06.topics.701-850.txt", "https://trec.nist.gov/data/terabyte/06/06.topics.701-850.txt"),
"2006-np" : ( "06.np_topics.901-1081.txt", "https://trec.nist.gov/data/terabyte/06/06.np_topics.901-1081.txt" ),
"2005-np" : ( "05.np_topics.601-872.final.txt", "https://trec.nist.gov/data/terabyte/05/05.np_topics.601-872.final.txt")
},
"qrels" : {
"2004" : ( "04.qrels.12-Nov-04", "https://trec.nist.gov/data/terabyte/04/04.qrels.12-Nov-04"),
"2005" : ( "05.adhoc_qrels", "https://trec.nist.gov/data/terabyte/05/05.adhoc_qrels"),
"2006" : ( "qrels.tb06.top50", "https://trec.nist.gov/data/terabyte/06/qrels.tb06.top50"),
"2004-2006" : _merge_years,
"2005-np" : ( "05.np_qrels", "https://trec.nist.gov/data/terabyte/05/05.np_qrels"),
"2006-np" : ( "qrels.tb06.np", "https://trec.nist.gov/data/terabyte/06/qrels.tb06.np"),
},
"info_url" : "https://trec.nist.gov/data/terabyte.html"
}
TREC_ROBUST_04_FILES = {
"qrels" : [ ("qrels.robust2004.txt", "https://trec.nist.gov/data/robust/qrels.robust2004.txt") ],
"topics" : [ ( "04.testset.gz", "https://trec.nist.gov/data/robust/04.testset.gz" ) ],
"info_url" : "https://trec.nist.gov/data/t13_robust.html",
}
TREC_ROBUST_05_FILES = {
"qrels" : [ ("TREC2005.qrels.txt", "https://trec.nist.gov/data/robust/05/TREC2005.qrels.txt") ],
"topics" : [ ( "05.50.topics.txt", "https://trec.nist.gov/data/robust/05/05.50.topics.txt" ) ],
"info_url" : "https://trec.nist.gov/data/t14_robust.html",
}
TREC_PRECISION_MEDICINE_FILES = {
"topics" : {
"2017" : ("topics2017.xml", "http://www.trec-cds.org/topics2017.xml", "trecxml"),
"2018" : ("topics2018.xml", "http://www.trec-cds.org/topics2018.xml", "trecxml"),
"2019" : ("topics2019.xml", "http://www.trec-cds.org/topics2019.xml", "trecxml"),
"2020" : ("topics2020.xml", "http://www.trec-cds.org/topics2020.xml", "trecxml")
},
"qrels" : {
"qrels-2017-abstracts" : ("qrels-2017-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-final-abstracts.txt"), #TODO keep original names?
"qrels-2017-abstracts-sample" : ("qrels-2017-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/sample-qrels-final-abstracts.txt"),
"qrels-2017-trials" : ("qrels-2017-trials.txt", "https://trec.nist.gov/data/precmed/qrels-final-trials.txt"),
"qrels-2018-abstracts" : ("qrels-2018-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-abstracts-2018-v2.txt"),
"qrels-2018-abstracts-sample" : ("qrels-2018-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sample-abstracts-v2.txt"),
"qrels-2018-trials" : ("qrels-2018-trials.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-clinical_trials-2018-v2.txt"),
"qrels-2018-trials-sample" : ("qrels-2018-trials-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sample-trials-v2.txt"),
"qrels-2019-abstracts" : ("qrels-2019-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-abstracts.2019.txt"),
"qrels-2019-trials" : ("qrels-2019-trials.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-trials.38.txt"),
"qrels-2019-abstracts-sample" : ("qrels-2019-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sampleval-abstracts.2019.txt"),
"qrels-2019-trials-sample" : ("qrels-2019-trials-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sampleval-trials.38.txt")
},
"info_url" : "https://trec.nist.gov/data/precmed.html",
}
VASWANI_CORPUS_BASE = "https://raw.githubusercontent.com/terrier-org/pyterrier/master/tests/fixtures/vaswani_npl/"
VASWANI_INDEX_BASE = "https://raw.githubusercontent.com/terrier-org/pyterrier/master/tests/fixtures/index/"
VASWANI_FILES = {
"corpus": [("doc-text.trec", [
VASWANI_CORPUS_BASE + "corpus/doc-text.trec",
irds_mirror("a059e713c50350e39999467c8c73b7c5")])],
"topics": [("query-text.trec", [
VASWANI_CORPUS_BASE + "query-text.trec",
irds_mirror("3a624be2b0ef7c9534cf848891679bec")])],
"qrels": [("qrels", [
VASWANI_CORPUS_BASE + "qrels",
irds_mirror("6acb6db9969da8b8c6c23c09551af8d9")])],
"index": _datarepo_index_default_none,
#"index":
# [(filename, VASWANI_INDEX_BASE + filename) for filename in STANDARD_TERRIER_INDEX_FILES + ["data.meta-0.fsomapfile"]],
"info_url" : "http://ir.dcs.gla.ac.uk/resources/test_collections/npl/",
"corpus_iter" : lambda dataset, **kwargs : pyterrier.index.treccollection2textgen(dataset.get_corpus(), num_docs=11429, verbose=kwargs.get("verbose", False))
}
DATASET_MAP = {
# used for UGlasgow teaching
"50pct" : RemoteDataset("50pct", FIFTY_PCT_FILES),
# umass antique corpus - see http://ciir.cs.umass.edu/downloads/Antique/
"antique" : RemoteDataset("antique", ANTIQUE_FILES),
# generated from http://ir.dcs.gla.ac.uk/resources/test_collections/npl/
"vaswani": RemoteDataset("vaswani", VASWANI_FILES),
"msmarco_document" : RemoteDataset("msmarco_document", MSMARCO_DOC_FILES),
"msmarcov2_document" : RemoteDataset("msmarcov2_document", MSMARCOv2_DOC_FILES),
"msmarco_passage" : RemoteDataset("msmarco_passage", MSMARCO_PASSAGE_FILES),
"msmarcov2_passage" : RemoteDataset("msmarcov2_passage", MSMARCOv2_PASSAGE_FILES),
"trec-robust-2004" : RemoteDataset("trec-robust-2004", TREC_ROBUST_04_FILES),
"trec-robust-2005" : RemoteDataset("trec-robust-2005", TREC_ROBUST_05_FILES),
"trec-terabyte" : RemoteDataset("trec-terabyte", TREC_TB_FILES),
#medical-like tracks
"trec-precision-medicine" : RemoteDataset("trec-precicion-medicine", TREC_PRECISION_MEDICINE_FILES),
"trec-covid" : RemoteDataset("trec-covid", TREC_COVID_FILES),
#wt2g
"trec-wt2g" : RemoteDataset("trec-wt2g", TREC_WT2G_FILES),
#wt10g
"trec-wt10g" : RemoteDataset("trec-wt10g", TREC_WT10G_FILES),
#.gov
"trec-wt-2002" : RemoteDataset("trec-wt-2002", TREC_WT_2002_FILES),
"trec-wt-2003" : RemoteDataset("trec-wt-2003", TREC_WT_2002_FILES),
"trec-wt-2004" : RemoteDataset("trec-wt-2004", TREC_WT_2004_FILES),
#clueweb09
"trec-wt-2009" : RemoteDataset("trec-wt-2009", TREC_WT_2009_FILES),
"trec-wt-2010" : RemoteDataset("trec-wt-2010", TREC_WT_2010_FILES),
"trec-wt-2011" : RemoteDataset("trec-wt-2011", TREC_WT_2011_FILES),
"trec-wt-2012" : RemoteDataset("trec-wt-2012", TREC_WT_2012_FILES),
}
# Include all datasets from ir_datasets with "irds:" prefix so they don't conflict with pt dataset names
# Results in records like:
# irds:antique
# irds:antique/test
# irds:antique/test/non-offensive
# irds:antique/train
# ...
import ir_datasets
for ds_id in ir_datasets.registry:
DATASET_MAP[f'irds:{ds_id}'] = IRDSDataset(ds_id, defer_load=True)
# "trec-deep-learning-docs"
#DATASET_MAP['msmarco_document'] = DATASET_MAP["trec-deep-learning-docs"]
#DATASET_MAP['msmarco_passage'] = DATASET_MAP["trec-deep-learning-passages"]
DATASET_MAP["trec-deep-learning-docs"] = DATASET_MAP['msmarco_document']
DATASET_MAP["trec-deep-learning-passages"] = DATASET_MAP['msmarco_passage']
def get_dataset(name, **kwargs):
"""
    Get a dataset by name. Any additional kwargs (e.g. user and password) are passed to the dataset's _configure() method.
"""
# Some datasets in ir_datasets are built on-the-fly (e.g., clirmatrix).
# Handle this by allocating it on demand here.
if name not in DATASET_MAP and name.startswith('irds:'):
# remove irds: prefix
ds_id = name[len('irds:'):]
DATASET_MAP[name] = IRDSDataset(ds_id)
rtr = DATASET_MAP[name]
rtr._configure(**kwargs)
return rtr
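# Illustrative sketch: configuration kwargs are forwarded to _configure(), e.g.
# credentials for password-protected downloads (values hypothetical):
#
#   dataset = get_dataset("50pct", user="uname", password="passwd")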
def datasets():
"""
Lists all the names of the datasets
"""
return DATASET_MAP.keys()
def find_datasets(query, en_only=True):
"""
    A grep-like method to help identify datasets. Filters the output of list_datasets() to datasets whose names contain the query.
"""
datasets = list_datasets(en_only=en_only)
return datasets[datasets['dataset'].str.contains(query)]
def list_datasets(en_only=True):
"""
Returns a dataframe of all datasets, listing which topics, qrels, corpus files or indices are available.
By default, filters to only datasets with both a corpus and topics in English.
"""
    rows = []
for k in datasets():
dataset = get_dataset(k)
rows.append([
k,
dataset._describe_component("topics"),
dataset.get_topics_lang(),
dataset._describe_component("qrels"),
dataset._describe_component("corpus"),
dataset.get_corpus_lang(),
dataset._describe_component("index"),
dataset.info_url() ])
result = pd.DataFrame(rows, columns=["dataset", "topics", "topics_lang", "qrels", "corpus", "corpus_lang", "index", "info_url"])
if en_only:
topics_filter = (result['topics'].isnull()) | (result['topics_lang'] == 'en')
corpus_filter = (result['corpus'].isnull()) | (result['corpus_lang'] == 'en')
result = result[topics_filter & corpus_filter]
return result