# Source code for pyterrier.datasets

import urllib.request
import wget
import os
import pandas as pd
from .transformer import is_lambda
import types
from typing import Union, Tuple, Iterator, Dict, Any, List
from warnings import warn
import requests
from .io import autoopen, touch
from . import tqdm, HOME_DIR
import tarfile
from warnings import warn

import pyterrier


class GeneratorLen(object):
    """
    Wraps an iterable together with a known length, so that consumers that
    call ``len()`` (e.g. progress bars) can be given an accurate total even
    though the underlying object (typically a generator) has no ``__len__``.

    Args:
        gen: the iterable (usually a generator) to iterate over.
        length: the number of items that ``gen`` is expected to yield. This
            value is reported as-is by ``len()``; it is not checked against
            what ``gen`` actually produces.
    """

    def __init__(self, gen, length):
        self.gen = gen
        self.length = length

    def __len__(self):
        return self.length

    def __iter__(self):
        # __iter__ must return an *iterator*. Returning self.gen directly only
        # works when gen already is one (e.g. a generator); iter() is a no-op
        # in that case and additionally supports plain iterables such as lists.
        return iter(self.gen)

class Dataset():
    """
    Represents a dataset (test collection) for indexing or retrieval. A common use-case is to use the Dataset within an Experiment::

        dataset = pt.get_dataset("trec-robust-2004")
        pt.Experiment([br1, br2], dataset.get_topics(), dataset.get_qrels(), eval_metrics=["map", "recip_rank"])

    This base class only defines the interface: apart from
    ``get_topicsqrels()``, every method here is a placeholder that returns
    ``None`` and is expected to be overridden by subclasses.
    """

    def _configure(self, **kwargs):
        # Hook for subclasses to receive the keyword arguments given to
        # get_dataset(); the base class ignores them.
        pass

    def get_corpus(self):
        """
        Returns the location of the files to allow indexing the corpus, i.e. it returns a list of filenames.
        """
        pass

    def get_corpus_iter(self, verbose=True) -> Iterator[Dict[str,Any]]:
        """
        Returns an iter of dicts for this collection. If verbose=True, a tqdm pbar shows the progress over this iterator.
        """
        pass

    def get_corpus_lang(self) -> Union[str,None]:
        """
        Returns the ISO 639-1 language code for the corpus, or None for multiple/other/unknown
        """
        return None

    def get_index(self, variant=None, **kwargs):
        """
        Returns the IndexRef of the index to allow retrieval. Only a few datasets provide indices ready made.
        """
        pass

    def get_topics(self, variant=None) -> pd.DataFrame:
        """
        Returns the topics, as a dataframe, ready for retrieval.
        """
        pass

    def get_topics_lang(self) -> Union[str,None]:
        """
        Returns the ISO 639-1 language code for the topics, or None for multiple/other/unknown
        """
        return None

    def get_qrels(self, variant=None) -> pd.DataFrame:
        """
        Returns the qrels, as a dataframe, ready for evaluation.
        """
        pass

    def get_topicsqrels(self, variant=None) -> Tuple[pd.DataFrame,pd.DataFrame]:
        """
        Returns both the topics and qrels in a tuple. This is useful for pt.Experiment().
        """
        # Both lookups use the same variant, so the pair is always consistent.
        return (
            self.get_topics(variant=variant),
            self.get_qrels(variant=variant)
        )

    def info_url(self):
        """
        Returns a url that provides more information about this dataset.
        """
        return None

    def get_results(self, variant=None) -> pd.DataFrame:
        """
        Returns a standard result set provided by the dataset. This is useful for re-ranking experiments.
        """
        pass
class RemoteDataset(Dataset): def __init__(self, name, locations): self.locations = locations = name self.user = None self.password = None def _configure(self, **kwargs): from os.path import expanduser pt_home = HOME_DIR if pt_home is None: from os.path import expanduser userhome = expanduser("~") pt_home = os.path.join(userhome, ".pyterrier") self.corpus_home = os.path.join(pt_home, "corpora", if 'user' in kwargs: self.user = kwargs['user'] self.password = kwargs['password'] @staticmethod def download(URLs : Union[str,List[str]], filename : str, **kwargs): import pyterrier as pt basename = os.path.basename(filename) if isinstance(URLs, str): URLs = [URLs] finalattempt=len(URLs)-1 error = None for i, url in enumerate(URLs): try: r = requests.get(url, allow_redirects=True, stream=True, **kwargs) r.raise_for_status() total = int(r.headers.get('content-length', 0)) with, 'b') as file, tqdm( desc=basename, total=total, unit='iB', unit_scale=True, unit_divisor=1024, ) as bar: for data in r.iter_content(chunk_size=1024): size = file.write(data) bar.update(size) break except Exception as e: if error is not None: e.__cause__ = error # chain errors to show all if fails error = e if i == finalattempt: raise error else: warn("Problem fetching %s, resorting to next mirror" % url) def _check_variant(self, component, variant=None): if not component in self.locations: raise ValueError("No %s in dataset %s" % (component, name)) if variant is None: if not isinstance(self.locations[component], list): raise ValueError("For %s in dataset %s, you must specify a variant. Available are: %s" % (component, name, str(list(self.locations[component].keys())))) else: if isinstance(self.locations[component], list): raise ValueError("For %s in dataset %s, there are no variants, but you specified %s" % (component, name, variant)) if not variant in self.locations[component]: raise ValueError("For %s in dataset %s, there is no variant %s. 
Available are: %s" % (component, name, variant, str(list(self.locations[component].keys())))) def _get_one_file(self, component, variant=None): filetype=None self._check_variant(component, variant) location = self.locations[component][0] if variant is None else self.locations[component][variant] if is_lambda(location) or isinstance(location, types.FunctionType): argcount = location.__code__.co_argcount if argcount == 0: return location() elif argcount == 3: return location(self, component, variant) else: raise TypeError("Expected function with 0 or 3 arguments for %s %s %s" % (component, name, variant)) local = location[0] URL = location[1] if len(location) > 2: filetype = location[2] if not os.path.exists(self.corpus_home): os.makedirs(self.corpus_home) local = os.path.join(self.corpus_home, local) actualURL = URL if isinstance(URL, str) else URL[0] if "#" in actualURL and not os.path.exists(local): tarname, intarfile = actualURL.split("#") assert not "/" in intarfile assert ".tar" in tarname or ".tgz" in tarname localtarfile, _ = self._get_one_file("tars", tarname) tarobj =, "r") tarobj.extract(intarfile, path=self.corpus_home) os.rename(os.path.join(self.corpus_home, intarfile), local) return (local, filetype) if not os.path.exists(local): try: print("Downloading %s %s to %s" % (, component, local)) kwargs = {} if self.user is not None: kwargs["auth"]=(self.user, self.password), local, **kwargs) except urllib.error.HTTPError as he: raise ValueError("Could not fetch " + URL) from he return (local, filetype) def _get_all_files(self, component, variant=None, **kwargs): if variant is None: localDir = os.path.join(self.corpus_home, component) else: localDir = os.path.join(self.corpus_home, component, variant) kwargs = {} if self.user is not None: kwargs["auth"]=(self.user, self.password) direxists = os.path.exists(localDir) location = self.locations[component] if is_lambda(location) or isinstance(location, types.FunctionType): # functions are expensive to call, 
normally another HTTP is needed. # just assume we have everthing we need if we have the local directory already # and it contains a .complete file. if direxists and os.path.exists(os.path.join(localDir, ".complete")): return localDir # call the function, and get the file list file_list = location(self, component, variant, **kwargs) else: file_list = self.locations[component] if variant is None else self.locations[component][variant] if not direxists: os.makedirs(localDir) print("Downloading %s %s to %s" % (, component, localDir)) # check for how much space is required and available space def _totalsize(file_list): total = -1 for f in file_list: if len(f) > 2: total += f[2] if total != -1: total += 1 return total totalsize = _totalsize(file_list) if totalsize > 0: import shutil total, used, free = shutil.disk_usage(localDir) if free < totalsize: raise ValueError("Insufficient freedisk space at %s to download index" % localDir) if totalsize > 2 * 2**30: warn("Downloading index of > 2GB.") for fileentry in file_list: local = fileentry[0] URL = fileentry[1] expectedlength = -1 if len(fileentry) == 3: expectedlength = fileentry[2] local = os.path.join(localDir, local) # if file exists and we know length, check if dowload is complete fileexists = os.path.exists(local) if fileexists and expectedlength >= 0: length = os.stat(local).st_size if expectedlength != length: warn("Removing partial download of %s (expected %d bytes, found %d)" % (local, expectedlength, length )) os.remove(local) fileexists = False if not fileexists: if "#" in URL: tarname, intarfile = URL.split("#") assert not "/" in intarfile assert ".tar" in tarname or ".tgz" in tarname localtarfile, _ = self._get_one_file("tars", tarname) tarobj =, "r") tarobj.extract(intarfile, path=self.corpus_home) local = os.path.join(self.corpus_home, local) #TODO, files could be recompressed here to save space os.rename(os.path.join(self.corpus_home, intarfile), local) else: try:, local, **kwargs) except 
urllib.error.HTTPError as he: raise ValueError("Could not fetch " + URL) from he # verify file if exists if expectedlength >= 0: length = os.stat(local).st_size if expectedlength != length: raise ValueError("Failed download of %s to %s (expected %d bytes, found %d)" % (URL, local, expectedlength, length )) # finally, touch a file signifying that download has been completed touch(os.path.join(localDir, ".complete")) return localDir def _describe_component(self, component): if component not in self.locations: return None if type(self.locations[component]) == type([]): return True if isinstance(self.locations[component], dict): return list(self.locations[component].keys()) return True def get_corpus(self, **kwargs): import pyterrier as pt return list(filter(lambda f : not f.endswith(".complete"),"corpus", **kwargs)))) def get_corpus_iter(self, **kwargs): if not "corpus_iter" in self.locations: raise ValueError("Cannot supply a corpus iterator on dataset %s" % return self.locations["corpus_iter"](self, **kwargs) def get_corpus_lang(self): if 'corpus' in self.locations: return 'en' # all are english return None def get_qrels(self, variant=None): import pyterrier as pt filename, type = self._get_one_file("qrels", variant) if type == "direct": return filename return def get_topics(self, variant=None, **kwargs): import pyterrier as pt file, filetype = self._get_one_file("topics", variant) if filetype is None or filetype in return, format=filetype, **kwargs) elif filetype == "direct": return file raise ValueError("Unknown filetype %s for %s topics %s" % (filetype,, variant)) def get_topics_lang(self): if 'topics' in self.locations: return 'en' # all are english return None def get_index(self, variant=None, **kwargs): import pyterrier as pt if == "50pct" and variant is None: variant="ex1" thedir = self._get_all_files("index", variant=variant, **kwargs) return thedir #return pt.autoclass("org.terrier.querying.IndexRef").of(os.path.join(thedir, "")) def __repr__(self): return 
"RemoteDataset for %s, with %s" % (, str(list(self.locations.keys()))) def info_url(self): return self.locations['info_url'] if "info_url" in self.locations else None class IRDSDataset(Dataset): def __init__(self, irds_id, defer_load=False): self._irds_id = irds_id self._irds_ref = None if defer_load else ir_datasets.load(self._irds_id) def irds_ref(self): if self._irds_ref is None: self._irds_ref = ir_datasets.load(self._irds_id) return self._irds_ref def get_corpus(self): raise NotImplementedError("IRDSDataset doesn't support get_corpus; use get_corpus_iter instead. If you " "are indexing, get_corpus_iter should be used in conjunction with IterDictIndexer.") def get_corpus_iter(self, verbose=True, start=0, count=None): ds = self.irds_ref() assert ds.has_docs(), f"{self._irds_id} doesn't support get_corpus_iter" it = ds.docs_iter() total = ds.docs_count() # use slicing if requested if start > 0 or count is not None: if count is not None: it = it[start:start+count] total = count else: it = it[start:] total -= start # tqdm support if verbose: it = tqdm(it, desc=f'{self._irds_id} documents', total=total) # rewrite to follow pyterrier std def gen(): for doc in it: doc = doc._asdict() # pyterrier uses "docno" doc['docno'] = doc.pop('doc_id') yield doc # ensure we can provide accurate len return GeneratorLen(gen(), total) def get_corpus_lang(self): ds = self.irds_ref() if ds.has_docs(): return ds.docs_lang() return None def get_index(self, variant=None): # this is only for indices where Terrier provides an index already raise NotImplementedError("IRDSDataset doesn't support get_index") def get_topics(self, variant=None, tokenise_query=True): """ Returns the topics, as a dataframe, ready for retrieval. 
""" ds = self.irds_ref() assert ds.has_queries(), f"{self._irds_id} doesn't support get_topics" qcls = ds.queries_cls() assert variant is None or variant in qcls._fields[1:], f"{self._irds_id} only supports the following topic variants {qcls._fields[1:]}" df = pd.DataFrame(ds.queries_iter()) df.rename(columns={"query_id": "qid"}, inplace=True) # pyterrier uses "qid" if variant is not None: # Some datasets have a query field called "query". We need to remove it or # we'll end up with multiple "query" columns, which will cause problems # because many components are written assuming no columns have the same name. if variant != 'query' and 'query' in df.columns: df.drop(columns=['query'], axis=1, inplace=True) df.rename(columns={variant: "query"}, inplace=True) # user specified which version of the query they want df.drop(columns=df.columns.difference(['qid','query']), axis=1, inplace=True) elif len(qcls._fields) == 2: # auto-rename single query field to "query" if there's only query_id and that field df.rename(columns={qcls._fields[1]: "query"}, inplace=True) else: print(f'There are multiple query fields available: {qcls._fields[1:]}. To use with pyterrier, provide variant or modify dataframe to add query column.') # apply pyterrier tokenisation (otherwise the queries may not play well with batchretrieve) if tokenise_query and 'query' in df: import pyterrier as pt tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser() def pt_tokenise(text): return ' '.join(tokeniser.getTokens(text)) df['query'] = df['query'].apply(pt_tokenise) return df def get_topics_lang(self): ds = self.irds_ref() if ds.has_queries(): return ds.queries_lang() return None def get_qrels(self, variant=None): """ Returns the qrels, as a dataframe, ready for evaluation. 
""" ds = self.irds_ref() assert ds.has_qrels(), f"{self._irds_id} doesn't support get_qrels" qrelcls = ds.qrels_cls() qrel_fields = [f for f in qrelcls._fields if f not in ('query_id', 'doc_id', 'iteration')] assert variant is None or variant in qrel_fields, f"{self._irds_id} only supports the following qrel variants {qrel_fields}" df = pd.DataFrame(ds.qrels_iter()) # pyterrier uses "qid" and "docno" df.rename(columns={ "query_id": "qid", "doc_id": "docno"}, inplace=True) # pyterrier uses "label" if variant is not None: df.rename(columns={variant: "label"}, inplace=True) if len(qrel_fields) == 1: # usually "relevance" df.rename(columns={qrel_fields[0]: "label"}, inplace=True) elif 'relevance' in qrel_fields: print(f'There are multiple qrel fields available: {qrel_fields}. Defaulting to "relevance", but to use a different one, supply variant') df.rename(columns={'relevance': "label"}, inplace=True) else: print(f'There are multiple qrel fields available: {qrel_fields}. To use with pyterrier, provide variant or modify dataframe to add query column.') return df def get_results(self, variant=None) -> pd.DataFrame: """ Returns a standard result set provided by the dataset. This is useful for re-ranking experiments. 
""" ds = self.irds_ref() assert ds.has_scoreddocs(), f"{self._irds_id} doesn't support get_reranking_run" result = pd.DataFrame(ds.scoreddocs) result = result.rename(columns={'query_id': 'qid', 'doc_id': 'docno'}) # convert irds field names to pyterrier names result.sort_values(by=['qid', 'score', 'docno'], ascending=[True, False, True], inplace=True) # ensure data is sorted by qid, -score, did # result doesn't yet contain queries (only qids) so load and merge them in topics = self.get_topics(variant) result = pd.merge(result, topics, how='left', on='qid', copy=False) return result def _describe_component(self, component): ds = self.irds_ref() if component == "topics": if ds.has_queries(): fields = ds.queries_cls()._fields[1:] if len(fields) > 1: return list(fields) return True return None if component == "qrels": if ds.has_qrels(): fields = [f for f in ds.qrels_cls()._fields if f not in ('query_id', 'doc_id', 'iteration')] if len(fields) > 1: return list(fields) return True return None if component == "corpus": return ds.has_docs() or None if component == "results": return ds.has_scoreddocs() or None return None def info_url(self): top_id = self._irds_id.split('/', 1)[0] suffix = f'#{self._irds_id}' if top_id != self._irds_id else '' return f'{top_id}.html{suffix}' def __repr__(self): return f"IRDSDataset({repr(self._irds_id)})" def passage_generate(dataset): for filename in dataset.get_corpus(): with autoopen(filename, 'rt') as corpusfile: for l in corpusfile: #for each line docno, passage = l.split("\t") yield {'docno' : docno, 'text' : passage} def _datarepo_index(self, component, variant=None, version='latest', **kwargs): if variant is None: raise ValueError(f"Must specify index variant for {}. 
See{}.dataset.html") urlprefix= f"{}/{variant}/{version}/" url = urlprefix + "files" try: r = requests.get(url, **kwargs) r.raise_for_status() file = r.text.splitlines() except Exception as e: raise ValueError(f"Could not find index variant {variant} for dataset {} at {url}. See available variants at{}.dataset.html") from e rtr = [] import re for linenum, line in enumerate(file): # skip comments if line.startswith("#"): continue try: (length, filename) = re.split(r"\s+", line.strip(), 2) rtr.append((filename, urlprefix+filename, int(length))) except Exception as e: raise ValueError(f"Could not parse {url} line {linenum} '{line}'") from e return rtr def _datarepo_index_default_none(self, component, variant=None, version='latest', **kwargs): """ For backward compatability with vaswani - use default for variant """ if variant is None: variant = 'terrier_stemmed' return _datarepo_index(self, component, variant=variant, version=version, **kwargs) ANTIQUE_FILES = { "topics" : { "train" : ("antique-train-queries.txt", "", "singleline"), "test" : ("antique-test-queries.txt", "", "singleline"), }, "qrels" : { "train" : ("antique-train.qrel", "", "singleline"), "test" : ("antique-test.qrel", "", "singleline"), }, "corpus" : [("antique-collection.txt", "")], "info_url" : "", "corpus_iter" : passage_generate } TREC_COVID_FILES = { "topics" : { "round1" : ("topics-rnd1.xml", "", "trecxml"), "round2" : ("topics-rnd2.xml", "", "trecxml"), "round3" : ("topics-rnd3.xml", "", "trecxml"), "round4" : ("topics-rnd4.xml", "", "trecxml"), "round5" : ("topics-rnd5.xml", "", "trecxml"), }, "qrels" : { "round1" : ("qrels-rnd1.txt", ""), "round2" : ("qrels-rnd2.txt", ""), "round3" : ("qrels-rnd3.txt", ""), "round3-cumulative" : ("qrels-rnd3-cumulative.txt", ""), "round4" : ("qrels-rnd4.txt", ""), "round4-cumulative" : ("qrels-rnd4-cumulative.txt", ""), "round5" : ("qrels-covid_d5_j4.5-5.txt", ""), }, "corpus" : { "round4": ("round4.tar.gz", ""), "round5": ("round5.tar.gz", ""), }, "docids" : 
{ "docids-rnd3" : ("docids-rnd3.txt", ""), "docids-rnd4" : ("docids-rnd4.txt", ""), "docids-rnd5" : ("docids-rnd5.txt", "") }, "info_url" : "", "index": _datarepo_index } def msmarco_document_generate(dataset): for filename in dataset.get_corpus(variant="corpus-tsv"): with autoopen(filename, 'rt') as corpusfile: for l in corpusfile: #for each line docno, url, title, passage = l.split("\t") yield {'docno' : docno, 'url' : url, 'title' : title, 'text' : passage} MSMARCO_DOC_FILES = { "corpus" : [("msmarco-docs.trec.gz", "")], "corpus-tsv": [("msmarco-docs.tsv.gz", "")], "topics" : { "train" : ("msmarco-doctrain-queries.tsv.gz", "", "singleline"), "dev" : ("msmarco-docdev-queries.tsv.gz", "", "singleline"), "test" : ("msmarco-test2019-queries.tsv.gz", "", "singleline"), "test-2020" : ("msmarco-test2020-queries.tsv.gz" , "", "singleline"), 'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "", "singleline") }, "qrels" : { "train" : ("msmarco-doctrain-qrels.tsv.gz", ""), "dev" : ("msmarco-docdev-qrels.tsv.gz", ""), "test" : ("2019qrels-docs.txt", ""), "test-2020" : ("2020qrels-docs.txt", "") }, "info_url" : "", "corpus_iter" : msmarco_document_generate, "index" : _datarepo_index } MSMARCO_PASSAGE_FILES = { "corpus" : [("collection.tsv", "collection.tar.gz#collection.tsv")], "index": { "terrier_stemmed" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES], "terrier_unstemmed" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_unstemmed/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES], "terrier_stemmed_text" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed_text/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES], "terrier_unstemmed_text" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_unstemmed_text/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES], "terrier_stemmed_deepct" : [(filename, 
TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed_deepct/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES], "terrier_stemmed_docT5query" : [(filename, TERRIER_DATA_BASE + "/msmarco_passage/terrier_stemmed_docT5query/latest/" + filename) for filename in STANDARD_TERRIER_INDEX_FILES], }, "topics" : { "train" : ("queries.train.tsv", "queries.tar.gz#queries.train.tsv", "singleline"), "dev" : ("", "", "singleline"), "dev.small" : ("", "", "singleline"), "eval" : ("queries.eval.tsv", "queries.tar.gz#queries.eval.tsv", "singleline"), "eval.small" : ("queries.eval.small.tsv", "collectionandqueries.tar.gz#queries.eval.small.tsv", "singleline"), "test-2019" : ("msmarco-test2019-queries.tsv.gz", "", "singleline"), "test-2020" : ("msmarco-test2020-queries.tsv.gz", "", "singleline") }, "tars" : { "queries.tar.gz" : ("queries.tar.gz", ""), "collection.tar.gz" : ("collection.tar.gz", ""), "collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "") }, "qrels" : { "train" : ("qrels.train.tsv", ""), "dev" : ("", ""), "test-2019" : ("2019qrels-docs.txt", ""), "test-2020" : ("2020qrels-docs.txt", ""), "dev.small" : ("", ""), }, "info_url" : "", "corpus_iter" : passage_generate, "index" : _datarepo_index } MSMARCOv2_DOC_FILES = { "info_url" : "", "topics" : { "train" : ("docv2_train_queries.tsv", "", "singleline"), "dev1" :("docv2_dev_queries.tsv", "", "singleline"), "dev2" :("docv2_dev2_queries.tsv", "", "singleline"), "valid1" : ("msmarco-test2019-queries.tsv.gz" , "", "singleline"), "valid2" : ("msmarco-test2020-queries.tsv.gz" , "", "singleline"), "trec_2021" : ("2021_queries.tsv" , "", "singleline"), }, "qrels" : { "train" : ("docv2_train_qrels.tsv", ""), "dev1" :("docv2_dev_qrels.tsv", ""), "dev2" :("docv2_dev2_qrels.tsv", ""), "valid1" : ("docv2_trec2019_qrels.txt.gz" , ""), "valid2" : ("docv2_trec2020_qrels.txt.gz" , "") }, "index" : _datarepo_index, } MSMARCOv2_PASSAGE_FILES = { "info_url" : "", "topics" : { "train" : ("passv2_train_queries.tsv", 
"", "singleline"), "dev1" : ("passv2_dev_queries.tsv", "", "singleline"), "dev2" : ("passv2_dev2_queries.tsv", "", "singleline"), "trec_2021" : ("2021_queries.tsv" , "", "singleline"), }, "qrels" : { "train" : ("passv2_train_qrels.tsv" ""), "dev1" : ("passv2_dev_qrels.tsv", ""), "dev2" : ("passv2_dev2_qrels.tsv", ""), }, "index" : _datarepo_index, } # remove WT- prefix from topics def remove_prefix(self, component, variant): import pyterrier as pt topics_file, type = self._get_one_file("topics_prefixed", variant) if type in topics =, type) else: raise ValueError("Unknown topic type %s" % type) topics["qid"] = topics.apply(lambda row: row["qid"].split("-")[1], axis=1) return (topics, "direct") # a function to fix the namedpage TREC Web tracks 2001 and 2002 def parse_desc_only(self, component, variant): import pyterrier as pt file, type = self._get_one_file("topics_desc_only", variant=variant) topics =, format="trec", whitelist=["DESC"], blacklist=None) topics["qid"] = topics.apply(lambda row: row["qid"].replace("NP", ""), axis=1) topics["qid"] = topics.apply(lambda row: row["qid"].replace("EP", ""), axis=1) return (topics, "direct") TREC_WT_2002_FILES = { "topics" : { "td" : ("webtopics_551-600.txt.gz", "", "trec"), "np" : parse_desc_only }, "topics_desc_only" : { "np" : ("webnamed_page_topics.1-150.txt.gz", "", "trec") }, "qrels" : { "np" : ("qrels.named-page.txt.gz", ""), "td" : ("qrels.distillation.txt.gz", "") }, "info_url" : "", } TREC_WT_2003_FILES = { "topics" : { "np" : ("webtopics_551-600.txt.gz", "", "trec"), "td" : ("2003.distillation_topics.1-50.txt", "", "trec"), }, "qrels" : { "np" : ("qrels.named-page.txt.gz", ""), "td" : ("qrels.distillation.2003.txt", "") }, "info_url" : "", } def irds_mirror(md5): return f'{md5}' def filter_on_qid_type(self, component, variant): if component == "topics": data = self.get_topics("all") elif component == "qrels": data = self.get_qrels("all") qid2type_file = self._get_one_file("topics_map")[0] qid2type = 
pd.read_csv(qid2type_file, names=["qid", "type"], sep=" ") qid2type["qid"] = qid2type.apply(lambda row: row["qid"].split("-")[1], axis=1) rtr = data.merge(qid2type[qid2type["type"] == variant], on=["qid"]) if len(rtr) == 0: raise ValueError("No such topic type '%s'" % variant) rtr.drop(columns=['type'], inplace=True) return (rtr, "direct") TREC_WT_2004_FILES = { "topics" : { "all" : remove_prefix, "np": filter_on_qid_type, "hp": filter_on_qid_type, "td": filter_on_qid_type, }, "topics_map" : [("04.topic-map.official.txt", [ "", irds_mirror("79737768b3be1aa07b14691aa54802c5"), "" ] )], "topics_prefixed" : { "all" : ("", [ "", irds_mirror("10821f7a000b8bec058097ede39570be"), ""], "trec") }, "qrels" : { "hp" : filter_on_qid_type, "td" : filter_on_qid_type, "np" : filter_on_qid_type, "all" : ("04.qrels.web.mixed.txt", [ "", irds_mirror("93daa0e4b4190c84e30d2cce78a0f674"), ""]) }, "info_url" : "", } FIFTY_PCT_INDEX_BASE = "" FIFTY_PCT_FILES = { "index": { "ex2" : [(filename, FIFTY_PCT_INDEX_BASE + "index/" + filename) for filename in ["data.meta-0.fsomapfile"] + STANDARD_TERRIER_INDEX_FILES], "ex3" : [(filename, FIFTY_PCT_INDEX_BASE + "ex3/" + filename) for filename in ["data.meta-0.fsomapfile", "data-pagerank.oos"] + STANDARD_TERRIER_INDEX_FILES], }, "topics": { "training" : ("training.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "training.topics", "trec"), "validation" : ("validation.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.topics", "trec"), }, "qrels": { "training" : ("training.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "training.qrels", "trec"), "validation" : ("validation.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.qrels", "trec"), } } # a function for the TREC Web track 2009 qrels, to make prels into qrels def prel2qrel(self, component, variant): prel_file, _ = self._get_one_file("prels", variant) df = pd.read_csv(prel_file, sep=" ", names=["qid", "docno", "label", "oth1", "oth2"])[["qid", "docno", "label"]] df["qid"] = 
df["qid"].astype(str) df["docno"] = df["docno"].astype(str) return (df, "direct") TREC_WT_2009_FILES = { "topics" : [ remove_prefix ], "topics_prefixed" : [ ("wt09.topics.queries-only", "", "singleline") ], "qrels" : { "adhoc" : prel2qrel, "adhoc.catA" : prel2qrel, "adhoc.catB" : prel2qrel, }, "prels" : { "adhoc" : ("prels.1-50.gz", ""), "adhoc.catA" : ("prels.catA.1-50.gz", ""), "adhoc.catB" : ("prels.catB.1-50.gz", "") }, "info_url" : "", } TREC_WT_2010_FILES = { "topics" : [ ("wt2010-topics.queries-only", "", "singleline") ], "qrels" : { "adhoc" : ("qrels.adhoc", "") }, "info_url" : "", } TREC_WT_2011_FILES = { "topics" : [ ("queries.101-150.txt", "", "singleline") ], "qrels" : { "adhoc" : ("qrels.adhoc", "") }, "info_url" : "", } TREC_WT_2012_FILES = { "topics" : [ ("queries.151-200.txt", "", "singleline") ], "qrels" : { "adhoc" : ("qrels.adhoc", "") }, "info_url" : "", } TREC_WT2G_FILES = { "qrels" : [ ("qrels.trec8.small_web.gz", "") ], "topics" : [ ( "topics.401-450.gz", "" ) ], "info_url" : "", } TREC_WT10G_FILES = { "qrels" : { "trec9" : ("qrels.trec9.main_web.gz", ""), "trec10-adhoc" : ("qrels.trec10.main_web.gz", ""), "trec10-hp" : ("qrels.trec10.entrypage.gz", ""), }, "topics" : { "trec9" : ( "topics.451-500.gz", "" ), "trec10-adhoc" : ( "topics.501-550.txt", "" ), "trec10-hp" : parse_desc_only }, "topics_desc_only" : { "trec10-hp" : ( "entry_page_topics.1-145.txt", "" ), }, "info_url" : "", } def _merge_years(self, component, variant): MAP_METHOD = { "topics" : RemoteDataset.get_topics, "qrels" : RemoteDataset.get_qrels, } dfs = [] low, hi = variant.split("-") for y in range(int(low), int(hi)+1): df = MAP_METHOD[component](self, variant=str(y)) dfs.append(df) return (pd.concat(dfs), "direct") TREC_TB_FILES = { "topics" : { "2004" : ( "04topics.701-750.txt", "" ), "2005" : ( "04topics.701-750.txt", "" ), "2006" : ( "06.topics.801-850.txt", "" ), "2004-2006" : ("06.topics.701-850.txt", ""), "2006-np" : ( "06.np_topics.901-1081.txt", "" ), "2005-np" : ( 
"", "") }, "qrels" : { "2004" : ( "04.qrels.12-Nov-04", ""), "2005" : ( "05.adhoc_qrels", ""), "2006" : ( "qrels.tb06.top50", ""), "2004-2006" : _merge_years, "2005-np" : ( "05.np_qrels", ""), "2006-np" : ( "", ""), }, "info_url" : "" } TREC_ROBUST_04_FILES = { "qrels" : [ ("qrels.robust2004.txt", "") ], "topics" : [ ( "04.testset.gz", "" ) ], "info_url" : "", } TREC_ROBUST_05_FILES = { "qrels" : [ ("TREC2005.qrels.txt", "") ], "topics" : [ ( "05.50.topics.txt", "" ) ], "info_url" : "", } TREC_PRECISION_MEDICINE_FILES = { "topics" : { "2017" : ("topics2017.xml", "", "trecxml"), "2018" : ("topics2018.xml", "", "trecxml"), "2019" : ("topics2019.xml", "", "trecxml"), "2020" : ("topics2020.xml", "", "trecxml") }, "qrels" : { "qrels-2017-abstracts" : ("qrels-2017-abstracts.txt", ""), #TODO keep original names? "qrels-2017-abstracts-sample" : ("qrels-2017-abstracts-sample.txt", ""), "qrels-2017-trials" : ("qrels-2017-trials.txt", ""), "qrels-2018-abstracts" : ("qrels-2018-abstracts.txt", ""), "qrels-2018-abstracts-sample" : ("qrels-2018-abstracts-sample.txt", ""), "qrels-2018-trials" : ("qrels-2018-trials.txt", ""), "qrels-2018-trials-sample" : ("qrels-2018-trials-sample.txt", ""), "qrels-2019-abstracts" : ("qrels-2019-abstracts.txt", ""), "qrels-2019-trials" : ("qrels-2019-trials.txt", ""), "qrels-2019-abstracts-sample" : ("qrels-2019-abstracts-sample.txt", ""), "qrels-2019-trials-sample" : ("qrels-2019-trials-sample.txt", "") }, "info_url" : "", } VASWANI_CORPUS_BASE = "" VASWANI_INDEX_BASE = "" VASWANI_FILES = { "corpus": [("doc-text.trec", [ VASWANI_CORPUS_BASE + "corpus/doc-text.trec", irds_mirror("a059e713c50350e39999467c8c73b7c5")])], "topics": [("query-text.trec", [ VASWANI_CORPUS_BASE + "query-text.trec", irds_mirror("3a624be2b0ef7c9534cf848891679bec")])], "qrels": [("qrels", [ VASWANI_CORPUS_BASE + "qrels", irds_mirror("6acb6db9969da8b8c6c23c09551af8d9")])], "index": _datarepo_index_default_none, #"index": # [(filename, VASWANI_INDEX_BASE + filename) for 
filename in STANDARD_TERRIER_INDEX_FILES + ["data.meta-0.fsomapfile"]], "info_url" : "", "corpus_iter" : lambda dataset, **kwargs : pyterrier.index.treccollection2textgen(dataset.get_corpus(), num_docs=11429, verbose=kwargs.get("verbose", False)) } DATASET_MAP = { # used for UGlasgow teaching "50pct" : RemoteDataset("50pct", FIFTY_PCT_FILES), # umass antique corpus - see "antique" : RemoteDataset("antique", ANTIQUE_FILES), # generated from "vaswani": RemoteDataset("vaswani", VASWANI_FILES), "msmarco_document" : RemoteDataset("msmarco_document", MSMARCO_DOC_FILES), "msmarcov2_document" : RemoteDataset("msmarcov2_document", MSMARCOv2_DOC_FILES), "msmarco_passage" : RemoteDataset("msmarco_passage", MSMARCO_PASSAGE_FILES), "msmarcov2_passage" : RemoteDataset("msmarcov2_passage", MSMARCOv2_PASSAGE_FILES), "trec-robust-2004" : RemoteDataset("trec-robust-2004", TREC_ROBUST_04_FILES), "trec-robust-2005" : RemoteDataset("trec-robust-2005", TREC_ROBUST_05_FILES), "trec-terabyte" : RemoteDataset("trec-terabyte", TREC_TB_FILES), #medical-like tracks "trec-precision-medicine" : RemoteDataset("trec-precicion-medicine", TREC_PRECISION_MEDICINE_FILES), "trec-covid" : RemoteDataset("trec-covid", TREC_COVID_FILES), #wt2g "trec-wt2g" : RemoteDataset("trec-wt2g", TREC_WT2G_FILES), #wt10g "trec-wt10g" : RemoteDataset("trec-wt10g", TREC_WT10G_FILES), "trec-wt-2002" : RemoteDataset("trec-wt-2002", TREC_WT_2002_FILES), "trec-wt-2003" : RemoteDataset("trec-wt-2003", TREC_WT_2002_FILES), "trec-wt-2004" : RemoteDataset("trec-wt-2004", TREC_WT_2004_FILES), #clueweb09 "trec-wt-2009" : RemoteDataset("trec-wt-2009", TREC_WT_2009_FILES), "trec-wt-2010" : RemoteDataset("trec-wt-2010", TREC_WT_2010_FILES), "trec-wt-2011" : RemoteDataset("trec-wt-2011", TREC_WT_2011_FILES), "trec-wt-2012" : RemoteDataset("trec-wt-2012", TREC_WT_2012_FILES), } # Include all datasets from ir_datasets with "irds:" prefix so they don't conflict with pt dataset names # Results in records like: # irds:antique # 
irds:antique/test # irds:antique/test/non-offensive # irds:antique/train # ... import ir_datasets for ds_id in ir_datasets.registry: DATASET_MAP[f'irds:{ds_id}'] = IRDSDataset(ds_id, defer_load=True) # "trec-deep-learning-docs" #DATASET_MAP['msmarco_document'] = DATASET_MAP["trec-deep-learning-docs"] #DATASET_MAP['msmarco_passage'] = DATASET_MAP["trec-deep-learning-passages"] DATASET_MAP["trec-deep-learning-docs"] = DATASET_MAP['msmarco_document'] DATASET_MAP["trec-deep-learning-passages"] = DATASET_MAP['msmarco_passage']
def get_dataset(name, **kwargs):
    """
    Get a dataset by name

    Args:
        name: key of the dataset in DATASET_MAP. Names starting with
            ``irds:`` that are not registered yet are created on demand.
        **kwargs: passed unchanged to the dataset's ``_configure()``.

    Returns:
        The configured dataset object stored in DATASET_MAP.

    Raises:
        KeyError: if ``name`` is not registered and does not start with ``irds:``.
    """
    # Some datasets in ir_datasets are built on-the-fly (e.g., clirmatrix).
    # Handle this by allocating it on demand here.
    if name not in DATASET_MAP and name.startswith('irds:'):
        # remove irds: prefix
        ds_id = name[len('irds:'):]
        DATASET_MAP[name] = IRDSDataset(ds_id)
    rtr = DATASET_MAP[name]
    # The registry holds one shared instance per name, so configuration
    # applied here is visible to every later caller asking for that name.
    rtr._configure(**kwargs)
    return rtr
def datasets():
    """
    Lists all the names of the datasets, i.e. the keys of DATASET_MAP.
    """
    names = DATASET_MAP.keys()
    return names
def find_datasets(query, en_only=True):
    """
    A grep-like method to help identify datasets. Filters the output of list_datasets() based on the name containing the query

    Args:
        query: pattern to look for in the ``dataset`` column. It is handed to
            pandas' ``str.contains`` with default options, which interprets
            it as a regular expression.
        en_only: forwarded to list_datasets().

    Returns:
        The rows of the list_datasets() dataframe whose name matches ``query``.
    """
    # Named "candidates" rather than "datasets" so that the module-level
    # datasets() function is not shadowed inside this function.
    candidates = list_datasets(en_only=en_only)
    return candidates[candidates['dataset'].str.contains(query)]
def list_datasets(en_only=True):
    """
    Returns a dataframe of all datasets, listing which topics, qrels, corpus files or indices are available.

    Args:
        en_only: when True (the default), drop datasets whose topics or corpus
            are present but not reported as English ('en'). Datasets that lack
            topics and/or a corpus altogether are kept.

    Returns:
        A dataframe with the columns "dataset", "topics", "topics_lang",
        "qrels", "corpus", "corpus_lang", "index" and "info_url", with one
        row per name returned by datasets().
    """
    rows = []
    for k in datasets():
        dataset = get_dataset(k)
        rows.append([
            k,
            dataset._describe_component("topics"),
            dataset.get_topics_lang(),
            dataset._describe_component("qrels"),
            dataset._describe_component("corpus"),
            dataset.get_corpus_lang(),
            dataset._describe_component("index"),
            dataset.info_url()
        ])
    result = pd.DataFrame(rows, columns=["dataset", "topics", "topics_lang", "qrels", "corpus", "corpus_lang", "index", "info_url"])
    if en_only:
        # A missing component (null) passes the filter; a present component
        # must be in English.
        topics_filter = (result['topics'].isnull()) | (result['topics_lang'] == 'en')
        corpus_filter = (result['corpus'].isnull()) | (result['corpus_lang'] == 'en')
        result = result[topics_filter & corpus_filter]
    return result