Source code for pyterrier.datasets

import urllib.request
import os
import json
import pandas as pd
from .transformer import is_lambda
from abc import abstractmethod
import types
from collections import defaultdict
from typing import Union, Tuple, Dict, List, Literal, Optional
from warnings import warn
import requests
from .io import autoopen, touch
import pyterrier as pt
import tarfile
import zipfile
import ir_datasets


TERRIER_DATA_BASE="http://data.terrier.org/indices/"
STANDARD_TERRIER_INDEX_FILES = [
    "data.direct.bf",
    "data.document.fsarrayfile",
    "data.inverted.bf",
    "data.lexicon.fsomapfile",
    "data.lexicon.fsomaphash",
    "data.lexicon.fsomapid",
    "data.meta.idx",
    "data.meta.zdata",
    "data.properties"
]

class GeneratorLen(object):
    def __init__(self, gen, length):
        self.gen = gen
        self.length = length

    def __len__(self): 
        return self.length

    def __iter__(self):
        return self.gen

[docs] class Dataset(): """ Represents a dataset (test collection) for indexing or retrieval. A common use-case is to use the Dataset within an Experiment:: dataset = pt.get_dataset("trec-robust-2004") pt.Experiment([br1, br2], dataset.get_topics(), dataset.get_qrels(), eval_metrics=["map", "recip_rank"]) """ def _configure(self, **kwargs): pass
[docs] def get_corpus(self): """ Returns the location of the files to allow indexing the corpus, i.e. it returns a list of filenames. """ pass
[docs] @abstractmethod def get_corpus_iter(self, verbose=True) -> pt.model.IterDict: """ Returns an iter of dicts for this collection. If verbose=True, a tqdm pbar shows the progress over this iterator. """ pass
[docs] def get_corpus_lang(self) -> Union[str,None]: """ Returns the ISO 639-1 language code for the corpus, or None for multiple/other/unknown """ return None
[docs] def get_index(self, variant=None, **kwargs): """ Returns the IndexRef of the index to allow retrieval. Only a few datasets provide indices ready made. """ pass
[docs] @abstractmethod def get_topics(self, variant=None) -> pd.DataFrame: """ Returns the topics, as a dataframe, ready for retrieval. """ pass
[docs] def get_topics_lang(self) -> Union[str,None]: """ Returns the ISO 639-1 language code for the topics, or None for multiple/other/unknown """ return None
[docs] @abstractmethod def get_qrels(self, variant=None) -> pd.DataFrame: """ Returns the qrels, as a dataframe, ready for evaluation. """ pass
[docs] def get_topicsqrels(self, variant=None) -> Tuple[pd.DataFrame,pd.DataFrame]: """ Returns both the topics and qrels in a tuple. This is useful for pt.Experiment(). """ return ( self.get_topics(variant=variant), self.get_qrels(variant=variant) )
[docs] def info_url(self): """ Returns a url that provides more information about this dataset. """ return None
[docs] def get_results(self, variant=None) -> pd.DataFrame: """ Returns a standard result set provided by the dataset. This is useful for re-ranking experiments. """ return None
class RemoteDataset(Dataset): def __init__(self, name, locations): self.locations = locations self.name = name self.user = None self.password = None def _configure(self, **kwargs): self.corpus_home = os.path.join(pt.io.pyterrier_home(), "corpora", self.name) if 'user' in kwargs: self.user = kwargs['user'] self.password = kwargs['password'] @staticmethod def download(URLs : Union[str,List[str]], filename : str, **kwargs): basename = os.path.basename(filename) if isinstance(URLs, str): URLs = [URLs] finalattempt=len(URLs)-1 error = None for i, url in enumerate(URLs): try: r = requests.get(url, allow_redirects=True, stream=True, **kwargs) r.raise_for_status() total = int(r.headers.get('content-length', 0)) with pt.io.finalized_open(filename, 'b') as file, pt.tqdm( desc=basename, total=total, unit='iB', unit_scale=True, unit_divisor=1024, ) as bar: for data in r.iter_content(chunk_size=1024): size = file.write(data) bar.update(size) break except Exception as e: if error is not None: e.__cause__ = error # chain errors to show all if fails error = e if i == finalattempt: raise error else: warn( "Problem fetching %s, resorting to next mirror" % url) def _check_variant(self, component, variant=None): name=self.name if component not in self.locations: raise ValueError("No %s in dataset %s" % (component, name)) if variant is None: if not isinstance(self.locations[component], list): raise ValueError("For %s in dataset %s, you must specify a variant. Available are: %s" % (component, name, str(list(self.locations[component].keys())))) else: if isinstance(self.locations[component], list): raise ValueError("For %s in dataset %s, there are no variants, but you specified %s" % (component, name, variant)) if variant not in self.locations[component]: raise ValueError("For %s in dataset %s, there is no variant %s. Available are: %s" % (component, name, variant, str(list(self.locations[component].keys())))) def _get_one_file(self, component, variant=None): filetype=None name=self.name self._check_variant(component, variant) location = self.locations[component][0] if variant is None else self.locations[component][variant] if is_lambda(location) or isinstance(location, types.FunctionType): argcount = location.__code__.co_argcount if argcount == 0: return location() elif argcount == 3: return location(self, component, variant) else: raise TypeError("Expected function with 0 or 3 arguments for %s %s %s" % (component, name, variant)) local = location[0] URL = location[1] if len(location) > 2: filetype = location[2] if not os.path.exists(self.corpus_home): os.makedirs(self.corpus_home) local = os.path.join(self.corpus_home, local) actualURL = URL if isinstance(URL, str) else URL[0] if "#" in actualURL and not os.path.exists(local): tarname, intarfile = actualURL.split("#") assert "/" not in intarfile assert ".tar" in tarname or ".tgz" in tarname or ".zip" in tarname localtarfile, _ = self._get_one_file("tars", tarname) extractor = zipfile.ZipFile if ".zip" in tarname else tarfile.open with extractor(localtarfile, "r") as tarobj: tarobj.extract(intarfile, path=self.corpus_home) os.rename(os.path.join(self.corpus_home, intarfile), local) return (local, filetype) if not os.path.exists(local): try: print("Downloading %s %s to %s" % (self.name, component, local)) kwargs = {} if self.user is not None: kwargs["auth"]=(self.user, self.password) RemoteDataset.download(URL, local, **kwargs) except urllib.error.HTTPError as he: raise ValueError("Could not fetch " + URL) from he return (local, filetype) def _get_all_files(self, component, variant=None, **kwargs): if variant is None: localDir = os.path.join(self.corpus_home, component) else: localDir = os.path.join(self.corpus_home, component, variant) kwargs = {} if self.user is not None: kwargs["auth"]=(self.user, self.password) direxists = os.path.exists(localDir) location = self.locations[component] if is_lambda(location) or isinstance(location, types.FunctionType): # functions are expensive to call, normally another HTTP is needed. # just assume we have everthing we need if we have the local directory already # and it contains a .complete file. if direxists and os.path.exists(os.path.join(localDir, ".complete")): return localDir # call the function, and get the file list file_list = location(self, component, variant, **kwargs) else: file_list = self.locations[component] if variant is None else self.locations[component][variant] if not direxists: os.makedirs(localDir) print("Downloading %s %s to %s" % (self.name, component, localDir)) # check for how much space is required and available space def _totalsize(file_list): total = -1 for f in file_list: if len(f) > 2: total += f[2] if total != -1: total += 1 return total totalsize = _totalsize(file_list) if totalsize > 0: import shutil total, used, free = shutil.disk_usage(localDir) if free < totalsize: raise ValueError("Insufficient freedisk space at %s to download index" % localDir) if totalsize > 2 * 2**30: warn( "Downloading index of > 2GB.") # all tarfiles that we will need to process tarfiles = defaultdict(list) for fileentry in file_list: local = fileentry[0] URL = fileentry[1] assert "/" not in local, "cant handle / in %s, local name is %s" % (local) expectedlength = -1 if len(fileentry) == 3: expectedlength = fileentry[2] local = os.path.join(localDir, local) # if file exists and we know length, check if dowload is complete fileexists = os.path.exists(local) if fileexists and expectedlength >= 0: length = os.stat(local).st_size if expectedlength != length: warn( "Removing partial download of %s (expected %d bytes, found %d)" % (local, expectedlength, length )) os.remove(local) fileexists = False if not fileexists: if "#" in URL: tarname, intarfile = URL.split("#") assert ".tar" in tarname or ".tgz" in tarname or ".zip" in tarname, "I dont know how to decompress file %s" % tarname localtarfile, _ = self._get_one_file("tars", tarname) # append intarfile to the list of files to be extracted from localtarfile tarfiles[localtarfile].append((intarfile, local)) else: try: RemoteDataset.download(URL, local, **kwargs) except urllib.error.HTTPError as he: raise ValueError("Could not fetch " + URL) from he # verify file if exists if expectedlength >= 0: length = os.stat(local).st_size if expectedlength != length: raise ValueError("Failed download of %s to %s (expected %d bytes, found %d)" % (URL, local, expectedlength, length )) # now extract all required files from each tar file for localtarfile in tarfiles: extractor = zipfile.ZipFile if ".zip" in tarname else tarfile.open with extractor(localtarfile, "r") as tarobj: # 5 is abrtary threshold - if we have lots of files to extract, give a progress bar. alternative would be delay=5? iter = pt.tqdm(tarfiles[localtarfile], unit="file", desc="Extracting from " + localtarfile) if len(tarfiles[localtarfile]) > 5 else tarfiles[localtarfile] for (intarfile, local) in iter: tarobj.extract(intarfile, path=self.corpus_home) local = os.path.join(self.corpus_home, local) os.rename(os.path.join(self.corpus_home, intarfile), local) #TODO, files /could/ be recompressed here to save space, if not already compressed # finally, touch a file signifying that download has been completed touch(os.path.join(localDir, ".complete")) return localDir def _describe_component(self, component): if component not in self.locations: return None if isinstance(self.locations[component], list): return True if isinstance(self.locations[component], dict): return list(self.locations[component].keys()) return True def get_corpus(self, **kwargs): return list(filter(lambda f : not f.endswith(".complete"), pt.io.find_files(self._get_all_files("corpus", **kwargs)))) def get_corpus_iter(self, **kwargs): if "corpus_iter" not in self.locations: raise ValueError("Cannot supply a corpus iterator on dataset %s" % self.name) return self.locations["corpus_iter"](self, **kwargs) def get_corpus_lang(self): if 'corpus' in self.locations: return 'en' # all are english return None def get_qrels(self, variant=None): filename, type = self._get_one_file("qrels", variant) if type == "direct": return filename return pt.io.read_qrels(filename) def get_topics(self, variant=None, **kwargs): file, filetype = self._get_one_file("topics", variant) if filetype is None or filetype in pt.io.SUPPORTED_TOPICS_FORMATS: return pt.io.read_topics(file, format=filetype, **kwargs) elif filetype == "direct": return file raise ValueError("Unknown filetype %s for %s topics %s" % (filetype, self.name, variant)) def get_topics_lang(self): if 'topics' in self.locations: return 'en' # all are english return None def get_index(self, variant=None, **kwargs): if self.name == "50pct" and variant is None: variant="ex1" thedir = self._get_all_files("index", variant=variant, **kwargs) return thedir def __repr__(self): return "RemoteDataset for %s, with %s" % (self.name, str(list(self.locations.keys()))) def info_url(self): return self.locations['info_url'] if "info_url" in self.locations else None @pt.java.required def _pt_tokeniser(): tokeniser = pt.terrier.J.Tokenizer.getTokeniser() def pt_tokenise(text): return ' '.join(tokeniser.getTokens(text)) return pt_tokenise class IRDSDataset(Dataset): def __init__(self, irds_id, defer_load=False): self._irds_id = irds_id self._irds_ref = None if defer_load else ir_datasets.load(self._irds_id) def irds_ref(self): if self._irds_ref is None: self._irds_ref = ir_datasets.load(self._irds_id) return self._irds_ref def get_corpus(self): raise NotImplementedError("IRDSDataset doesn't support get_corpus; use get_corpus_iter instead. If you " "are indexing, get_corpus_iter should be used in conjunction with IterDictIndexer.") def get_corpus_iter(self, verbose=True, start=0, count=None): ds = self.irds_ref() assert ds.has_docs(), f"{self._irds_id} doesn't support get_corpus_iter" it = ds.docs_iter() total = ds.docs_count() # use slicing if requested if start > 0 or count is not None: if count is not None: it = it[start:start+count] total = count else: it = it[start:] total -= start # tqdm support if verbose: it = pt.tqdm(it, desc=f'{self._irds_id} documents', total=total) # rewrite to follow pyterrier std def gen(): for doc in it: doc = doc._asdict() # pyterrier uses "docno" doc['docno'] = doc.pop('doc_id') yield doc # ensure we can provide accurate len return GeneratorLen(gen(), total) def get_corpus_lang(self): ds = self.irds_ref() if ds.has_docs(): return ds.docs_lang() return None def get_index(self, variant=None): # this is only for indices where Terrier provides an index already raise NotImplementedError("IRDSDataset doesn't support get_index") def get_topics(self, variant=None, tokenise_query=True): """ Returns the topics, as a dataframe, ready for retrieval. """ ds = self.irds_ref() assert ds.has_queries(), f"{self._irds_id} doesn't support get_topics" qcls = ds.queries_cls() assert variant is None or variant in qcls._fields[1:], f"{self._irds_id} only supports the following topic variants {qcls._fields[1:]}" df = pd.DataFrame(ds.queries_iter()) df.rename(columns={"query_id": "qid"}, inplace=True) # pyterrier uses "qid" if variant is not None: # Some datasets have a query field called "query". We need to remove it or # we'll end up with multiple "query" columns, which will cause problems # because many components are written assuming no columns have the same name. if variant != 'query' and 'query' in df.columns: df.drop(columns=['query'], axis=1, inplace=True) df.rename(columns={variant: "query"}, inplace=True) # user specified which version of the query they want df.drop(columns=df.columns.difference(['qid','query']), axis=1, inplace=True) elif len(qcls._fields) == 2: # auto-rename single query field to "query" if there's only query_id and that field df.rename(columns={qcls._fields[1]: "query"}, inplace=True) else: print(f'There are multiple query fields available: {qcls._fields[1:]}. To use with pyterrier, provide variant or modify dataframe to add query column.') # apply pyterrier tokenisation (otherwise the queries may not play well with batchretrieve) if tokenise_query and 'query' in df: tokeniser = _pt_tokeniser() df['query'] = df['query'].apply(tokeniser) return df def get_topics_lang(self): ds = self.irds_ref() if ds.has_queries(): return ds.queries_lang() return None def get_qrels(self, variant=None): """ Returns the qrels, as a dataframe, ready for evaluation. """ ds = self.irds_ref() assert ds.has_qrels(), f"{self._irds_id} doesn't support get_qrels" qrelcls = ds.qrels_cls() qrel_fields = [f for f in qrelcls._fields if f not in ('query_id', 'doc_id', 'iteration')] assert variant is None or variant in qrel_fields, f"{self._irds_id} only supports the following qrel variants {qrel_fields}" df = pd.DataFrame(ds.qrels_iter()) # pyterrier uses "qid" and "docno" df.rename(columns={ "query_id": "qid", "doc_id": "docno"}, inplace=True) # pyterrier uses "label" if variant is not None: df.rename(columns={variant: "label"}, inplace=True) if len(qrel_fields) == 1: # usually "relevance" df.rename(columns={qrel_fields[0]: "label"}, inplace=True) elif 'relevance' in qrel_fields: print(f'There are multiple qrel fields available: {qrel_fields}. Defaulting to "relevance", but to use a different one, supply variant') df.rename(columns={'relevance': "label"}, inplace=True) else: print(f'There are multiple qrel fields available: {qrel_fields}. To use with pyterrier, provide variant or modify dataframe to add query column.') return df def get_results(self, variant=None) -> pd.DataFrame: """ Returns a standard result set provided by the dataset. This is useful for re-ranking experiments. """ ds = self.irds_ref() assert ds.has_scoreddocs(), f"{self._irds_id} doesn't support get_reranking_run" result = pd.DataFrame(ds.scoreddocs) result = result.rename(columns={'query_id': 'qid', 'doc_id': 'docno'}) # convert irds field names to pyterrier names result.sort_values(by=['qid', 'score', 'docno'], ascending=[True, False, True], inplace=True) # ensure data is sorted by qid, -score, did # result doesn't yet contain queries (only qids) so load and merge them in topics = self.get_topics(variant) result = pd.merge(result, topics, how='left', on='qid') return result def _describe_component(self, component): ds = self.irds_ref() if component == "topics": if ds.has_queries(): fields = ds.queries_cls()._fields[1:] if len(fields) > 1: return list(fields) return True return None if component == "qrels": if ds.has_qrels(): fields = [f for f in ds.qrels_cls()._fields if f not in ('query_id', 'doc_id', 'iteration')] if len(fields) > 1: return list(fields) return True return None if component == "corpus": return ds.has_docs() or None if component == "results": return ds.has_scoreddocs() or None return None def info_url(self): top_id = self._irds_id.split('/', 1)[0] suffix = f'#{self._irds_id}' if top_id != self._irds_id else '' return f'https://ir-datasets.com/{top_id}.html{suffix}' def __repr__(self): return f"IRDSDataset({repr(self._irds_id)})" def text_loader( self, fields: Union[List[str], str, Literal['*']] = '*', *, verbose: bool = False, ) -> pt.Transformer: """Create a transformer that loads text fields from an ir_datasets dataset into a DataFrame by docno. Args: fields: The fields to load from the dataset. If '*', all fields will be loaded. verbose: Whether to print debug information. """ return IRDSTextLoader(self, fields, verbose=verbose) class IRDSTextLoader(pt.Transformer): """A transformer that loads text fields from an ir_datasets dataset into a DataFrame by docno.""" def __init__( self, dataset: IRDSDataset, fields: Union[List[str], str, Literal['*']] = '*', *, verbose=False ): """Initialise the transformer with the index to load metadata from. Args: dataset: The dataset to load text from. fields: The fields to load from the dataset. If '*', all fields will be loaded. verbose: Whether to print debug information. """ if not dataset.irds_ref().has_docs(): raise ValueError(f"Dataset {dataset} does not provide docs") docs_cls = dataset.irds_ref().docs_cls() available_fields = [f for f in docs_cls._fields if f != 'doc_id' and docs_cls.__annotations__[f] is str] if fields == '*': fields = available_fields else: if isinstance(fields, str): fields = [fields] missing_fields = set(fields) - set(available_fields) if missing_fields: raise ValueError(f"Dataset {dataset} did not have requested metaindex keys {list(missing_fields)}. " f"Keys present in metaindex are {available_fields}") self.dataset = dataset self.fields = fields self.verbose = verbose def transform(self, inp: pd.DataFrame) -> pd.DataFrame: """Load text fields from the dataset into the input DataFrame. Args: inp: The input DataFrame. Must contain 'docno'. Returns: A new DataFrame with the text columns appended. """ if 'docno' not in inp.columns: raise ValueError(f"input missing 'docno' column, available columns: {list(inp.columns)}") irds = self.dataset.irds_ref() docstore = irds.docs_store() docnos = inp.docno.values.tolist() # Load the new data fields = ['doc_id'] + self.fields set_docnos = set(docnos) it = (tuple(getattr(doc, f) for f in fields) for doc in docstore.get_many_iter(set_docnos)) if self.verbose: it = pt.tqdm(it, unit='d', total=len(set_docnos), desc='IRDSTextLoader') metadata = pd.DataFrame(list(it), columns=fields).set_index('doc_id') metadata_frame = metadata.loc[docnos].reset_index(drop=True) # append the input and metadata frames inp = inp.drop(columns=self.fields, errors='ignore') # make sure we don't end up with duplicates inp = inp.reset_index(drop=True) # reset the index to default (matching metadata_frame) return pd.concat([inp, metadata_frame], axis='columns') def passage_generate(dataset): for filename in dataset.get_corpus(): with autoopen(filename, 'rt') as corpusfile: for line in corpusfile: #for each line docno, passage = line.split("\t") yield {'docno' : docno, 'text' : passage} def _datarepo_index(self, component, variant=None, version='latest', **kwargs): if variant is None: raise ValueError(f"Must specify index variant for {self.name}. See http://data.terrier.org/{self.name}.dataset.html") urlprefix= f"http://data.terrier.org/indices/{self.name}/{variant}/{version}/" url = urlprefix + "files" try: r = requests.get(url, **kwargs) r.raise_for_status() file = r.text.splitlines() except Exception as e: raise ValueError(f"Could not find index variant {variant} for dataset {self.name} at {url}. See available variants at http://data.terrier.org/{self.name}.dataset.html") from e rtr = [] import re for linenum, line in enumerate(file): # skip comments if line.startswith("#"): continue try: (length, filename) = re.split(r"\s+", line.strip(), 2) rtr.append((filename, urlprefix+filename, int(length))) except Exception as e: raise ValueError(f"Could not parse {url} line {linenum} '{line}'") from e return rtr def _datarepo_index_default_none(self, component, variant=None, version='latest', **kwargs): """ For backward compatability with vaswani - use default for variant """ if variant is None: variant = 'terrier_stemmed' return _datarepo_index(self, component, variant=variant, version=version, **kwargs) ANTIQUE_FILES = { "topics" : { "train" : ("antique-train-queries.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt", "singleline"), "test" : ("antique-test-queries.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt", "singleline"), }, "qrels" : { "train" : ("antique-train.qrel", "http://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel", "singleline"), "test" : ("antique-test.qrel", "http://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel", "singleline"), }, "corpus" : [("antique-collection.txt", "http://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt")], "info_url" : "https://ciir.cs.umass.edu/downloads/Antique/readme.txt", "corpus_iter" : passage_generate } TREC_COVID_FILES = { "topics" : { "round1" : ("topics-rnd1.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml", "trecxml"), "round2" : ("topics-rnd2.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml", "trecxml"), "round3" : ("topics-rnd3.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd3.xml", "trecxml"), "round4" : ("topics-rnd4.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd4.xml", "trecxml"), "round5" : ("topics-rnd5.xml", "https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml", "trecxml"), }, "qrels" : { "round1" : ("qrels-rnd1.txt", "https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt"), "round2" : ("qrels-rnd2.txt", "https://ir.nist.gov/covidSubmit/data/qrels-rnd2.txt"), "round3" : ("qrels-rnd3.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j2.5-3.txt"), "round3-cumulative" : ("qrels-rnd3-cumulative.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j0.5-3.txt"), "round4" : ("qrels-rnd4.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j3.5-4.txt"), "round4-cumulative" : ("qrels-rnd4-cumulative.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d4_j0.5-4.txt"), "round5" : ("qrels-covid_d5_j4.5-5.txt", "https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j4.5-5.txt"), }, "corpus" : { "round4": ("round4.tar.gz", "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-06-19.tar.gz"), "round5": ("round5.tar.gz", "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz"), }, "docids" : { "docids-rnd3" : ("docids-rnd3.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd3.txt"), "docids-rnd4" : ("docids-rnd4.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd4.txt"), "docids-rnd5" : ("docids-rnd5.txt", "https://ir.nist.gov/covidSubmit/data/docids-rnd5.txt") }, "info_url" : "https://ir.nist.gov/covidSubmit/", "index": _datarepo_index } def msmarco_document_generate(dataset): for filename in dataset.get_corpus(variant="corpus-tsv"): with autoopen(filename, 'rt') as corpusfile: for line in corpusfile: #for each line docno, url, title, passage = line.split("\t") yield {'docno' : docno, 'url' : url, 'title' : title, 'text' : passage} MSMARCO_DOC_FILES = { "corpus" : [("msmarco-docs.trec.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")], "corpus-tsv": [("msmarco-docs.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")], "topics" : { "train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"), "dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"), "test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), "test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"), 'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline") }, "qrels" : { "train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"), "dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"), "test" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-docs.txt"), "test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-docs.txt") }, "info_url" : "https://microsoft.github.io/msmarco/", "corpus_iter" : msmarco_document_generate, "index" : _datarepo_index } MSMARCO_PASSAGE_FILES = { "corpus" : [("collection.tsv", "collection.tar.gz#collection.tsv")], "topics" : { "train" : ("queries.train.tsv", "queries.tar.gz#queries.train.tsv", "singleline"), "dev" : ("queries.dev.tsv", "queries.tar.gz#queries.dev.tsv", "singleline"), "dev.small" : ("queries.dev.small.tsv", "collectionandqueries.tar.gz#queries.dev.small.tsv", "singleline"), "eval" : ("queries.eval.tsv", "queries.tar.gz#queries.eval.tsv", "singleline"), "eval.small" : ("queries.eval.small.tsv", "collectionandqueries.tar.gz#queries.eval.small.tsv", "singleline"), "test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), "test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline") }, "tars" : { "queries.tar.gz" : ("queries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz"), "collection.tar.gz" : ("collection.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz"), "collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz") }, "qrels" : { "train" : ("qrels.train.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv"), "dev" : ("qrels.dev.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv"), "test-2019" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-pass.txt"), "test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-pass.txt"), "dev.small" : ("qrels.dev.small.tsv", "collectionandqueries.tar.gz#qrels.dev.small.tsv"), }, "info_url" : "https://microsoft.github.io/MSMARCO-Passage-Ranking/", "corpus_iter" : passage_generate, "index" : _datarepo_index } MSMARCOv2_DOC_FILES = { "info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html", "topics" : { "train" : ("docv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"), "dev1" :("docv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"), "dev2" :("docv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"), "valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), "valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"), "trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"), }, "qrels" : { "train" : ("docv2_train_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"), "dev1" :("docv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"), "dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"), "valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"), "valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz") }, "index" : _datarepo_index, } MSMARCOv2_PASSAGE_FILES = { "info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html", "topics" : { "train" : ("passv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"), "dev1" : ("passv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"), "dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"), "trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"), }, "qrels" : { "train" : ("passv2_train_qrels.tsv" "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"), "dev1" : ("passv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"), "dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"), }, "index" : _datarepo_index, } # remove WT- prefix from topics def remove_prefix(self, component, variant): topics_file, type = self._get_one_file("topics_prefixed", variant) if type in pt.io.SUPPORTED_TOPICS_FORMATS: topics = pt.io.read_topics(topics_file, type) else: raise ValueError("Unknown topic type %s" % type) topics["qid"] = topics.apply(lambda row: row["qid"].split("-")[1], axis=1) return (topics, "direct") # a function to fix the namedpage TREC Web tracks 2001 and 2002 def parse_desc_only(self, component, variant): file, type = self._get_one_file("topics_desc_only", variant=variant) topics = pt.io.read_topics(file, format="trec", whitelist=["DESC"], blacklist=None) topics["qid"] = topics.apply(lambda row: row["qid"].replace("NP", ""), axis=1) topics["qid"] = topics.apply(lambda row: row["qid"].replace("EP", ""), axis=1) return (topics, "direct") TREC_WT_2002_FILES = { "topics" : { "td" : ("webtopics_551-600.txt.gz", "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "trec"), "np" : parse_desc_only }, "topics_desc_only" : { "np" : ("webnamed_page_topics.1-150.txt.gz", "https://trec.nist.gov/data/topics_eng/webnamed_page_topics.1-150.txt.gz", "trec") }, "qrels" : { "np" : ("qrels.named-page.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz"), "td" : ("qrels.distillation.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.distillation.txt.gz") }, "info_url" : "https://trec.nist.gov/data/t11.web.html", } TREC_WT_2003_FILES = { "topics" : { "np" : ("webtopics_551-600.txt.gz", "https://trec.nist.gov/data/topics_eng/webtopics_551-600.txt.gz", "trec"), "td" : ("2003.distillation_topics.1-50.txt", "https://trec.nist.gov/data/topics_eng/2003.distillation_topics.1-50.txt", "trec"), }, "qrels" : { "np" : ("qrels.named-page.txt.gz", "https://trec.nist.gov/data/qrels_eng/qrels.named-page.txt.gz"), "td" : ("qrels.distillation.2003.txt", "https://trec.nist.gov/data/qrels_eng/qrels.distillation.2003.txt") }, "info_url" : "https://trec.nist.gov/data/t12.web.html", } def irds_mirror(md5): return f'http://mirror.ir-datasets.com/{md5}' def filter_on_qid_type(self, component, variant): if component == "topics": data = self.get_topics("all") elif component == "qrels": data = self.get_qrels("all") qid2type_file = self._get_one_file("topics_map")[0] qid2type = pd.read_csv(qid2type_file, names=["qid", "type"], sep=" ") qid2type["qid"] = qid2type.apply(lambda row: row["qid"].split("-")[1], axis=1) rtr = data.merge(qid2type[qid2type["type"] == variant], on=["qid"]) if len(rtr) == 0: raise ValueError("No such topic type '%s'" % variant) rtr.drop(columns=['type'], inplace=True) return (rtr, "direct") TREC_WT_2004_FILES = { "topics" : { "all" : remove_prefix, "np": filter_on_qid_type, "hp": filter_on_qid_type, "td": filter_on_qid_type, }, "topics_map" : [("04.topic-map.official.txt", [ "https://trec.nist.gov/data/web/04.topic-map.official.txt", irds_mirror("79737768b3be1aa07b14691aa54802c5"), "https://www.dcs.gla.ac.uk/~craigm/04.topic-map.official.txt" ] )], "topics_prefixed" : { "all" : ("Web2004.query.stream.trecformat.txt", [ "https://trec.nist.gov/data/web/Web2004.query.stream.trecformat.txt", irds_mirror("10821f7a000b8bec058097ede39570be"), "https://www.dcs.gla.ac.uk/~craigm/Web2004.query.stream.trecformat.txt"], "trec") }, "qrels" : { "hp" : filter_on_qid_type, "td" : filter_on_qid_type, "np" : filter_on_qid_type, "all" : ("04.qrels.web.mixed.txt", [ "https://trec.nist.gov/data/web/04.qrels.web.mixed.txt", irds_mirror("93daa0e4b4190c84e30d2cce78a0f674"), "https://www.dcs.gla.ac.uk/~craigm/04.qrels.web.mixed.txt"]) }, "info_url" : "https://trec.nist.gov/data/t13.web.html", } FIFTY_PCT_INDEX_BASE = "http://www.dcs.gla.ac.uk/~craigm/IR_HM/" FIFTY_PCT_FILES = { "index": { "ex2" : [(filename, FIFTY_PCT_INDEX_BASE + "index/" + filename) for filename in ["data.meta-0.fsomapfile"] + STANDARD_TERRIER_INDEX_FILES], "ex3" : [(filename, FIFTY_PCT_INDEX_BASE + "ex3/" + filename) for filename in ["data.meta-0.fsomapfile", "data-pagerank.oos"] + STANDARD_TERRIER_INDEX_FILES], }, "topics": { "training" : ("training.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "training.topics", "trec"), "validation" : ("validation.topics", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.topics", "trec"), }, "qrels": { "training" : ("training.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "training.qrels", "trec"), "validation" : ("validation.qrels", FIFTY_PCT_INDEX_BASE + "topics/" + "validation.qrels", "trec"), } } # a function for the TREC Web track 2009 qrels, to make prels into qrels def prel2qrel(self, component, variant): prel_file, _ = self._get_one_file("prels", variant) df = pd.read_csv(prel_file, sep=" ", names=["qid", "docno", "label", "oth1", "oth2"])[["qid", "docno", "label"]] df["qid"] = df["qid"].astype(str) df["docno"] = df["docno"].astype(str) return (df, "direct") TREC_WT_2009_FILES = { "topics" : [ remove_prefix ], "topics_prefixed" : [ ("wt09.topics.queries-only", "https://trec.nist.gov/data/web/09/wt09.topics.queries-only", "singleline") ], "qrels" : { "adhoc" : prel2qrel, "adhoc.catA" : prel2qrel, "adhoc.catB" : prel2qrel, }, "prels" : { "adhoc" : ("prels.1-50.gz", "https://trec.nist.gov/data/web/09/prels.1-50.gz"), "adhoc.catA" : ("prels.catA.1-50.gz", "https://trec.nist.gov/data/web/09/prels.catA.1-50.gz"), "adhoc.catB" : ("prels.catB.1-50.gz", "https://trec.nist.gov/data/web/09/prels.catB.1-50.gz") }, "info_url" : "https://trec.nist.gov/data/web09.html", } TREC_WT_2010_FILES = { "topics" : [ ("wt2010-topics.queries-only", "https://trec.nist.gov/data/web/10/wt2010-topics.queries-only", "singleline") ], "qrels" : { "adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/10/10.adhoc-qrels.final") }, "info_url" : "https://trec.nist.gov/data/web10.html", } TREC_WT_2011_FILES = { "topics" : [ ("queries.101-150.txt", "https://trec.nist.gov/data/web/11/queries.101-150.txt", "singleline") ], "qrels" : { "adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/11/qrels.adhoc") }, "info_url" : "https://trec.nist.gov/data/web2011.html", } TREC_WT_2012_FILES = { "topics" : [ ("queries.151-200.txt", "https://trec.nist.gov/data/web/12/queries.151-200.txt", "singleline") ], "qrels" : { "adhoc" : ("qrels.adhoc", "https://trec.nist.gov/data/web/12/qrels.adhoc") }, "info_url" : "https://trec.nist.gov/data/web2012.html", } TREC_WT2G_FILES = { "qrels" : [ ("qrels.trec8.small_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec8.small_web.gz") ], "topics" : [ ( "topics.401-450.gz", "https://trec.nist.gov/data/topics_eng/topics.401-450.gz" ) ], "info_url" : "https://trec.nist.gov/data/t8.web.html", } TREC_WT10G_FILES = { "qrels" : { "trec9" : ("qrels.trec9.main_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec9.main_web.gz"), "trec10-adhoc" : ("qrels.trec10.main_web.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec10.main_web.gz"), "trec10-hp" : ("qrels.trec10.entrypage.gz", "https://trec.nist.gov/data/qrels_eng/qrels.trec10.entrypage.gz"), }, "topics" : { "trec9" : ( "topics.451-500.gz", "https://trec.nist.gov/data/topics_eng/topics.451-500.gz" ), "trec10-adhoc" : ( "topics.501-550.txt", "https://trec.nist.gov/data/topics_eng/topics.501-550.txt" ), "trec10-hp" : parse_desc_only }, "topics_desc_only" : { "trec10-hp" : ( "entry_page_topics.1-145.txt", "https://trec.nist.gov/data/topics_eng/entry_page_topics.1-145.txt" ), }, "info_url" : "https://trec.nist.gov/data/t9.web.html", } def _merge_years(self, component, variant): MAP_METHOD = { "topics" : RemoteDataset.get_topics, "qrels" : RemoteDataset.get_qrels, } dfs = [] low, hi = variant.split("-") for y in range(int(low), int(hi)+1): df = MAP_METHOD[component](self, variant=str(y)) dfs.append(df) return (pd.concat(dfs), "direct") TREC_TB_FILES = { "topics" : { "2004" : ( "04topics.701-750.txt", "https://trec.nist.gov/data/terabyte/04/04topics.701-750.txt" ), "2005" : ( "04topics.701-750.txt", "https://trec.nist.gov/data/terabyte/05/05.topics.751-800.txt" ), "2006" : ( "06.topics.801-850.txt", "https://trec.nist.gov/data/terabyte/06/06.topics.801-850.txt" ), "2004-2006" : ("06.topics.701-850.txt", "https://trec.nist.gov/data/terabyte/06/06.topics.701-850.txt"), "2006-np" : ( "06.np_topics.901-1081.txt", "https://trec.nist.gov/data/terabyte/06/06.np_topics.901-1081.txt" ), "2005-np" : ( "05.np_topics.601-872.final.txt", "https://trec.nist.gov/data/terabyte/05/05.np_topics.601-872.final.txt") }, "qrels" : { "2004" : ( "04.qrels.12-Nov-04", "https://trec.nist.gov/data/terabyte/04/04.qrels.12-Nov-04"), "2005" : ( "05.adhoc_qrels", "https://trec.nist.gov/data/terabyte/05/05.adhoc_qrels"), "2006" : ( "qrels.tb06.top50", "https://trec.nist.gov/data/terabyte/06/qrels.tb06.top50"), "2004-2006" : _merge_years, "2005-np" : ( "05.np_qrels", "https://trec.nist.gov/data/terabyte/05/05.np_qrels"), "2006-np" : ( "qrels.tb06.np", "https://trec.nist.gov/data/terabyte/06/qrels.tb06.np"), }, "info_url" : "https://trec.nist.gov/data/terabyte.html" } TREC_ROBUST_04_FILES = { "qrels" : [ ("qrels.robust2004.txt", "https://trec.nist.gov/data/robust/qrels.robust2004.txt") ], "topics" : [ ( "04.testset.gz", "https://trec.nist.gov/data/robust/04.testset.gz" ) ], "info_url" : "https://trec.nist.gov/data/t13_robust.html", } TREC_ROBUST_05_FILES = { "qrels" : [ ("TREC2005.qrels.txt", "https://trec.nist.gov/data/robust/05/TREC2005.qrels.txt") ], "topics" : [ ( "05.50.topics.txt", "https://trec.nist.gov/data/robust/05/05.50.topics.txt" ) ], "info_url" : "https://trec.nist.gov/data/t14_robust.html", } TREC_PRECISION_MEDICINE_FILES = { "topics" : { "2017" : ("topics2017.xml", "http://www.trec-cds.org/topics2017.xml", "trecxml"), "2018" : ("topics2018.xml", "http://www.trec-cds.org/topics2018.xml", "trecxml"), "2019" : ("topics2019.xml", "http://www.trec-cds.org/topics2019.xml", "trecxml"), "2020" : ("topics2020.xml", "http://www.trec-cds.org/topics2020.xml", "trecxml") }, "qrels" : { "qrels-2017-abstracts" : ("qrels-2017-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-final-abstracts.txt"), #TODO keep original names? "qrels-2017-abstracts-sample" : ("qrels-2017-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/sample-qrels-final-abstracts.txt"), "qrels-2017-trials" : ("qrels-2017-trials.txt", "https://trec.nist.gov/data/precmed/qrels-final-trials.txt"), "qrels-2018-abstracts" : ("qrels-2018-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-abstracts-2018-v2.txt"), "qrels-2018-abstracts-sample" : ("qrels-2018-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sample-abstracts-v2.txt"), "qrels-2018-trials" : ("qrels-2018-trials.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-clinical_trials-2018-v2.txt"), "qrels-2018-trials-sample" : ("qrels-2018-trials-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sample-trials-v2.txt"), "qrels-2019-abstracts" : ("qrels-2019-abstracts.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-abstracts.2019.txt"), "qrels-2019-trials" : ("qrels-2019-trials.txt", "https://trec.nist.gov/data/precmed/qrels-treceval-trials.38.txt"), "qrels-2019-abstracts-sample" : ("qrels-2019-abstracts-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sampleval-abstracts.2019.txt"), "qrels-2019-trials-sample" : ("qrels-2019-trials-sample.txt", "https://trec.nist.gov/data/precmed/qrels-sampleval-trials.38.txt") }, "info_url" : "https://trec.nist.gov/data/precmed.html", } VASWANI_CORPUS_BASE = "https://raw.githubusercontent.com/terrier-org/pyterrier/master/tests/fixtures/vaswani_npl/" VASWANI_INDEX_BASE = "https://raw.githubusercontent.com/terrier-org/pyterrier/master/tests/fixtures/index/" VASWANI_FILES = { "corpus": [("doc-text.trec", [ VASWANI_CORPUS_BASE + "corpus/doc-text.trec", irds_mirror("a059e713c50350e39999467c8c73b7c5")])], "topics": [("query-text.trec", [ VASWANI_CORPUS_BASE + "query-text.trec", irds_mirror("3a624be2b0ef7c9534cf848891679bec")])], "qrels": [("qrels", [ VASWANI_CORPUS_BASE + "qrels", irds_mirror("6acb6db9969da8b8c6c23c09551af8d9")])], "index": _datarepo_index_default_none, #"index": # [(filename, VASWANI_INDEX_BASE + filename) for filename in STANDARD_TERRIER_INDEX_FILES + ["data.meta-0.fsomapfile"]], "info_url" : "http://ir.dcs.gla.ac.uk/resources/test_collections/npl/", "corpus_iter" : lambda dataset, **kwargs : pt.index.treccollection2textgen(dataset.get_corpus(), num_docs=11429, verbose=kwargs.get("verbose", False)) } DATASET_MAP : Dict[str, Dataset] = { # used for UGlasgow teaching "50pct" : RemoteDataset("50pct", FIFTY_PCT_FILES), # umass antique corpus - see http://ciir.cs.umass.edu/downloads/Antique/ "antique" : RemoteDataset("antique", ANTIQUE_FILES), # generated from http://ir.dcs.gla.ac.uk/resources/test_collections/npl/ "vaswani": RemoteDataset("vaswani", VASWANI_FILES), "msmarco_document" : RemoteDataset("msmarco_document", MSMARCO_DOC_FILES), "msmarcov2_document" : RemoteDataset("msmarcov2_document", MSMARCOv2_DOC_FILES), "msmarco_passage" : RemoteDataset("msmarco_passage", MSMARCO_PASSAGE_FILES), "msmarcov2_passage" : RemoteDataset("msmarcov2_passage", MSMARCOv2_PASSAGE_FILES), "trec-robust-2004" : RemoteDataset("trec-robust-2004", TREC_ROBUST_04_FILES), "trec-robust-2005" : RemoteDataset("trec-robust-2005", TREC_ROBUST_05_FILES), "trec-terabyte" : RemoteDataset("trec-terabyte", TREC_TB_FILES), #medical-like tracks "trec-precision-medicine" : RemoteDataset("trec-precicion-medicine", TREC_PRECISION_MEDICINE_FILES), "trec-covid" : RemoteDataset("trec-covid", TREC_COVID_FILES), #wt2g "trec-wt2g" : RemoteDataset("trec-wt2g", TREC_WT2G_FILES), #wt10g "trec-wt10g" : RemoteDataset("trec-wt10g", TREC_WT10G_FILES), #.gov "trec-wt-2002" : RemoteDataset("trec-wt-2002", TREC_WT_2002_FILES), "trec-wt-2003" : RemoteDataset("trec-wt-2003", TREC_WT_2002_FILES), "trec-wt-2004" : RemoteDataset("trec-wt-2004", TREC_WT_2004_FILES), #clueweb09 "trec-wt-2009" : RemoteDataset("trec-wt-2009", TREC_WT_2009_FILES), "trec-wt-2010" : RemoteDataset("trec-wt-2010", TREC_WT_2010_FILES), "trec-wt-2011" : RemoteDataset("trec-wt-2011", TREC_WT_2011_FILES), "trec-wt-2012" : RemoteDataset("trec-wt-2012", TREC_WT_2012_FILES), } # Include all datasets from ir_datasets with "irds:" prefix so they don't conflict with pt dataset names # Results in records like: # irds:antique # irds:antique/test # irds:antique/test/non-offensive # irds:antique/train # ... for ds_id in ir_datasets.registry: DATASET_MAP[f'irds:{ds_id}'] = IRDSDataset(ds_id, defer_load=True) # "trec-deep-learning-docs" #DATASET_MAP['msmarco_document'] = DATASET_MAP["trec-deep-learning-docs"] #DATASET_MAP['msmarco_passage'] = DATASET_MAP["trec-deep-learning-passages"] DATASET_MAP["trec-deep-learning-docs"] = DATASET_MAP['msmarco_document'] DATASET_MAP["trec-deep-learning-passages"] = DATASET_MAP['msmarco_passage']
[docs] def get_dataset(name, **kwargs): """ Get a dataset by name """ # Some datasets in ir_datasets are built on-the-fly (e.g., clirmatrix). # Handle this by allocating it on demand here. if name not in DATASET_MAP and name.startswith('irds:'): # remove irds: prefix ds_id = name[len('irds:'):] DATASET_MAP[name] = IRDSDataset(ds_id) rtr = DATASET_MAP[name] rtr._configure(**kwargs) return rtr
def datasets(): """ Lists all the names of the datasets """ return DATASET_MAP.keys()
[docs] def find_datasets(query, en_only=True): """ A grep-like method to help identify datasets. Filters the output of list_datasets() based on the name containing the query """ datasets = list_datasets(en_only=en_only) return datasets[datasets['dataset'].str.contains(query)]
[docs] def list_datasets(en_only=True): """ Returns a dataframe of all datasets, listing which topics, qrels, corpus files or indices are available. By default, filters to only datasets with both a corpus and topics in English. """ import pandas as pd import os # we should supress any IRDS warning about deprecated datasets restore_env = os.environ.get("IR_DATASETS_SKIP_DEPRECATED_WARNING", None) try: os.environ['IR_DATASETS_SKIP_DEPRECATED_WARNING'] = 'true' rows=[] for k in datasets(): dataset = get_dataset(k) rows.append([ k, dataset._describe_component("topics"), dataset.get_topics_lang(), dataset._describe_component("qrels"), dataset._describe_component("corpus"), dataset.get_corpus_lang(), dataset._describe_component("index"), dataset.info_url() ]) finally: if restore_env is None: del os.environ['IR_DATASETS_SKIP_DEPRECATED_WARNING'] else: os.environ['IR_DATASETS_SKIP_DEPRECATED_WARNING'] = restore_env result = pd.DataFrame(rows, columns=["dataset", "topics", "topics_lang", "qrels", "corpus", "corpus_lang", "index", "info_url"]) if en_only: topics_filter = (result['topics'].isnull()) | (result['topics_lang'] == 'en') corpus_filter = (result['corpus'].isnull()) | (result['corpus_lang'] == 'en') result = result[topics_filter & corpus_filter] return result
def transformer_from_dataset( dataset : Union[str, Dataset], clz, variant: Optional[str] = None, version: str = 'latest', **kwargs) -> pt.Transformer: """Returns a Transformer instance of type ``clz`` for the provided index of variant ``variant``.""" if isinstance(dataset, str): dataset = get_dataset(dataset) if version != "latest": raise ValueError("index versioning not yet supported") if hasattr(dataset, 'get_index'): indexref = dataset.get_index(variant) else: raise ValueError('dataset doe not support get_index()') classname = clz.__name__ classnames = [classname] if classname == 'Retriever': # we need to look for BatchRetrieve.args.json for legacy support classnames.append('BatchRetrieve') for c in classnames: # now look for, e.g., BatchRetrieve.args.json file, which will define the args for Retriever, e.g. stemming indexdir = indexref #os.path.dirname(indexref.toString()) argsfile = os.path.join(indexdir, classname + ".args.json") if os.path.exists(argsfile): with pt.io.autoopen(argsfile, "rt") as f: args = json.load(f) # anything specified in kwargs of this methods overrides the .args.json file args.update(kwargs) kwargs = args return clz(indexref, **kwargs)