Source code for pyterrier.io

import os
import pandas as pd
from contextlib import contextmanager


def coerce_dataframe(obj):
    if isinstance(obj, pd.DataFrame):
        return obj
    import types
    if isinstance(obj, types.GeneratorType):
        # it's a generator; assume it yields dataframes
        rtr = []
        for x in obj:
            assert isinstance(x, pd.DataFrame)
            rtr.append(x)
        return pd.concat(rtr)
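
# Usage sketch (illustrative, not part of the original module): coerce_dataframe
# lets callers pass either a dataframe or a generator of dataframe chunks.
# The generator below is hypothetical.
#
#     def chunked_results():
#         yield pd.DataFrame([["q1", "d1", 1.0]], columns=["qid", "docno", "score"])
#         yield pd.DataFrame([["q1", "d2", 0.5]], columns=["qid", "docno", "score"])
#
#     df = coerce_dataframe(chunked_results())  # one concatenated dataframe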


def autoopen(filename, mode='rb'):
    """
    A drop-in for open() that applies automatic compression for .gz, .bz2 and .lz4 file extensions
    """
    if filename.endswith(".gz"):
        import gzip
        return gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        return bz2.open(filename, mode)
    elif filename.endswith(".lz4"):
        import lz4.frame
        return lz4.frame.open(filename, mode)
    return open(filename, mode)
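
# Usage sketch (illustrative): the same reading loop works for plain and
# compressed files, since autoopen dispatches on the file extension. The
# filename below is hypothetical.
#
#     with autoopen("run.res.gz", "rt") as f:
#         for line in f:
#             print(line.rstrip())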


def find_files(dir):
    """
    Returns all the files present in a directory and its subdirectories

    Args:
        dir(str): The directory containing the files

    Returns:
        paths(list): A list of the paths to the files
    """
    files = []
    for (dirpath, dirnames, filenames) in os.walk(dir, followlinks=True):
        for name in filenames:
            files.append(os.path.join(dirpath, name))
    return sorted(files)


@contextmanager
def _finalized_open_base(path, mode, open_fn):
    assert mode in ('b', 't')  # must supply either binary or text mode
    path_tmp = '{}.tmp{}'.format(*os.path.splitext(path))  # add tmp before extension (needed for autoopen)
    # adapted from <https://github.com/allenai/ir_datasets/blob/master/ir_datasets/util/__init__.py#L34>
    try:
        with open_fn(path_tmp, f'x{mode}') as f:  # open in exclusive write mode (raises error if already exists)
            yield f
        os.replace(path_tmp, path)  # on success, move temp file to original path
    except:
        try:
            os.remove(path_tmp)
        except:
            pass  # edge case: removing temp file failed. Ignore and just raise orig error
        raise


def finalized_open(path: str, mode: str):
    """
    Opens a file for writing, but reverts it if there was an error in the process.

    Args:
        path(str): Path of file to open
        mode(str): Either t or b, for text or binary mode

    Example:
        Returns a contextmanager that provides a file object, so should be used in a "with" statement. E.g.::

            with pt.io.finalized_open("file.txt", "t") as f:
                f.write("some text")
            # file.txt exists with contents "some text"

        If there is an error when writing, the file is reverted::

            with pt.io.finalized_open("file.txt", "t") as f:
                f.write("some other text")
                raise Exception("an error")
            # file.txt remains unchanged (if it existed, contents unchanged; if it didn't exist, it still doesn't)
    """
    return _finalized_open_base(path, mode, open)


def finalized_autoopen(path: str, mode: str):
    """
    Opens a file for writing with ``autoopen``, but reverts it if there was an error in the process.

    Args:
        path(str): Path of file to open
        mode(str): Either t or b, for text or binary mode

    Example:
        Returns a contextmanager that provides a file object, so should be used in a "with" statement. E.g.::

            with pt.io.finalized_autoopen("file.gz", "t") as f:
                f.write("some text")
            # file.gz exists with contents "some text"

        If there is an error when writing, the file is reverted::

            with pt.io.finalized_autoopen("file.gz", "t") as f:
                f.write("some other text")
                raise Exception("an error")
            # file.gz remains unchanged (if it existed, contents unchanged; if it didn't exist, it still doesn't)
    """
    return _finalized_open_base(path, mode, autoopen)


def ok_filename(fname) -> bool:
    """
    Checks to see if a filename is valid.
    """
    BAD_CHARS = ':"%/<>^|?' + os.sep
    for c in BAD_CHARS:
        if c in fname:
            return False
    return True
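
# Usage sketch (illustrative): ok_filename can validate a user-supplied run
# name before it is used to create a file. The name below is hypothetical.
#
#     name = "BM25>PL2"
#     if not ok_filename(name):
#         name = name.replace(">", "_")   # now a safe filename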


def touch(fname, mode=0o666, dir_fd=None, **kwargs):
    """
    Equivalent to the touch command on Linux.
    Implementation from https://stackoverflow.com/a/1160227
    """
    flags = os.O_CREAT | os.O_APPEND
    with os.fdopen(os.open(fname, flags=flags, mode=mode, dir_fd=dir_fd)) as f:
        os.utime(f.fileno() if os.utime in os.supports_fd else fname,
                 dir_fd=None if os.supports_fd else dir_fd, **kwargs)
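
# Usage sketch (illustrative): touch creates an empty file, or updates the
# modification time of an existing one, e.g. as a marker that a step has
# completed. The path below is hypothetical.
#
#     touch("/tmp/indexing.complete")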


def read_results(filename, format="trec", topics=None, dataset=None, **kwargs):
    """
    Reads a file into a results dataframe.

    Parameters:
        filename (str): The filename of the file to be read. Compressed files are handled automatically. A URL is also supported for the "trec" format.
        format (str): The format of the results file: one of "trec", "letor". Default is "trec".
        topics (None or pandas.DataFrame): If provided, these topics are merged into the results. This is helpful for providing query text. Cannot be used in conjunction with the dataset argument.
        dataset (None, str or pyterrier.datasets.Dataset): If provided, loads topics from the dataset (or dataset ID) and merges them into the results. This is helpful for providing query text. Cannot be used in conjunction with the topics argument.
        **kwargs (dict): Other arguments for the internal method

    Returns:
        dataframe with usual qid, docno, score columns etc

    Examples::

        # a dataframe of results can be used directly in a pt.Experiment
        pt.Experiment(
            [ pt.io.read_results("/path/to/baselines-results.res.gz") ],
            topics,
            qrels,
            ["map"]
        )

        # make a transformer from a results dataframe, including the query text
        first_pass = pt.Transformer.from_df( pt.io.read_results("/path/to/results.gz", topics=topics) )

        # make a max_passage retriever based on previously saved results
        max_passage = (first_pass
            >> pt.text.get_text(dataset)
            >> pt.text.sliding()
            >> pt.text.scorer()
            >> pt.text.max_passage()
        )
    """
    if format not in SUPPORTED_RESULTS_FORMATS:
        raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_RESULTS_FORMATS.keys())))
    results = SUPPORTED_RESULTS_FORMATS[format][0](filename, **kwargs)
    if dataset is not None:
        assert topics is None, "Cannot provide both dataset and topics"
        if isinstance(dataset, str):
            import pyterrier as pt
            dataset = pt.get_dataset(dataset)
        topics = dataset.get_topics()
    if topics is not None:
        results = pd.merge(results, topics, how='left', on='qid')
    return results


def _read_results_letor(filename, labels=False):

    def _parse_line(l):
        # the original Perl implementation, kept for reference:
        # $line =~ s/(#.*)$//;
        # my $comment = $1;
        # my @parts = split /\s+/, $line;
        # my $label = shift @parts;
        # my %hash = map {split /:/, $_} @parts;
        # return ($label, $comment, %hash);
        import re
        import numpy as np
        line, comment = l.split("#")
        line = line.strip()
        parts = re.split(r'\s+|:', line)
        label = parts.pop(0)
        m = re.search(r'docno\s?=\s?(\S+)', comment)
        docno = m.group(1)
        kv = {}
        qid = None
        for i, k in enumerate(parts):
            if i % 2 == 0:
                if k == "qid":
                    qid = parts[i+1]
                else:
                    kv[int(k)] = float(parts[i+1])
        features = np.array([kv[i] for i in sorted(kv.keys())])
        return (label, qid, docno, features)

    with autoopen(filename, 'rt') as f:
        rows = []
        for line in f:
            if line.startswith("#"):
                continue
            (label, qid, docno, features) = _parse_line(line)
            if labels:
                rows.append([qid, docno, features, label])
            else:
                rows.append([qid, docno, features])
        return pd.DataFrame(rows, columns=["qid", "docno", "features", "label"] if labels else ["qid", "docno", "features"])


def _read_results_trec(filename):
    df = pd.read_csv(filename, sep=r'\s+',
                     names=["qid", "iter", "docno", "rank", "score", "name"],
                     dtype={'qid': str, 'docno': str, 'rank': int, 'score': float})
    df = df.drop(columns="iter")
    return df
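
# Illustrative input for _read_results_letor -- a hypothetical LETOR-format
# line, matching what _write_results_letor (below) produces:
#
#     2 qid:1 1:0.370000 2:1.200000 # docno=d42
#
# This parses to label "2", qid "1", docno "d42" and features [0.37, 1.2].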


def write_results(res, filename, format="trec", append=False, **kwargs):
    """
    Write a results dataframe to a file.

    Parameters:
        res (DataFrame): A results dataframe, with the usual columns of qid, docno etc
        filename (str): The filename of the file to be written. Compressed files are handled automatically.
        format (str): The format of the results file: one of "trec", "letor", "minimal"
        append (bool): Append to an existing file. Defaults to False.
        **kwargs (dict): Other arguments for the internal method

    Supported Formats:
        * "trec" -- output columns are $qid Q0 $docno $rank $score $runname, space separated
        * "letor" -- This follows the LETOR and MSLR datasets, in that output columns are $label qid:$qid [$fid:$value]+ # docno=$docno
        * "minimal" -- output columns are $qid $docno $rank, tab-separated. This is used for submissions to the MSMARCO leaderboard.
    """
    if format not in SUPPORTED_RESULTS_FORMATS:
        raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_RESULTS_FORMATS.keys())))
    # convert generators to dataframes
    res = coerce_dataframe(res)
    return SUPPORTED_RESULTS_FORMATS[format][1](res, filename, append=append, **kwargs)
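
# Usage sketch (illustrative): round-tripping a run through write_results;
# compression is applied automatically for the hypothetical .gz filename.
#
#     res = pt.io.read_results("input.res")
#     pt.io.write_results(res, "output.res.gz", format="trec")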


def _write_results_trec(res, filename, run_name="pyterrier", append=False):
    res_copy = res.copy()[["qid", "docno", "rank", "score"]]
    res_copy.insert(1, "Q0", "Q0")
    res_copy.insert(5, "run_name", run_name)
    res_copy.to_csv(filename, sep=" ", mode='a' if append else 'w', header=False, index=False)


def _write_results_minimal(res, filename, run_name="pyterrier", append=False):
    res_copy = res.copy()[["qid", "docno", "rank"]]
    res_copy.to_csv(filename, sep="\t", mode='a' if append else 'w', header=False, index=False)


def _write_results_letor(res, filename, qrels=None, default_label=0, append=False):
    if qrels is not None:
        res = res.merge(qrels, on=['qid', 'docno'], how='left').fillna(default_label)
    mode = 'at' if append else 'wt'  # append or (over)write, in text mode ('wa' is not a valid mode)
    with autoopen(filename, mode) as f:
        for row in res.itertuples():
            values = row.features
            label = row.label if qrels is not None else default_label
            feat_str = ' '.join(['%i:%f' % (i+1, values[i]) for i in range(len(values))])
            f.write("%d qid:%s %s # docno=%s\n" % (label, row.qid, feat_str, row.docno))
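
# Illustrative output of _write_results_trec -- one hypothetical line of a
# standard TREC run file ($qid Q0 $docno $rank $score $runname):
#
#     1 Q0 d42 0 14.13 pyterrier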


def read_topics(filename, format="trec", **kwargs):
    """
    Reads a file containing topics.

    Parameters:
        filename(str): The filename of the topics file. A URL is supported for the "trec" and "singleline" formats.
        format(str): One of "trec", "trecxml" or "singleline". Default is "trec".

    Returns:
        pandas.Dataframe with columns=['qid','query']; both columns have type string

    Supported Formats:
        * "trec" -- an SGML-formatted TREC topics file. Delimited by TOP tags, each having NUM and TITLE tags; DESC and NARR tags are skipped by default. Control using the whitelist and blacklist kwargs.
        * "trecxml" -- a more modern XML-formatted topics file. Delimited by topic tags, each having number tags. query, question and narrative tags are parsed by default. Control using the tags kwarg.
        * "singleline" -- one query per line, preceded by its query id (separated by a space or colon). Tokenised by default; use the tokenise=False kwarg to prevent tokenisation.
    """
    if format is None:
        format = "trec"
    if format not in SUPPORTED_TOPICS_FORMATS:
        raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_TOPICS_FORMATS.keys())))
    return SUPPORTED_TOPICS_FORMATS[format](filename, **kwargs)
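
# Usage sketch (illustrative): reading TREC-formatted topics; the URL below
# is a hypothetical example.
#
#     topics = pt.io.read_topics("https://example.org/topics.401-450.txt")
#     # topics is a dataframe with string columns qid and query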


def _read_topics_trec(file_path, doc_tag="TOP", id_tag="NUM", whitelist=["TITLE"], blacklist=["DESC", "NARR"]):
    from jnius import autoclass
    from . import check_version
    assert check_version("5.3")
    trecquerysource = autoclass('org.terrier.applications.batchquerying.TRECQuery')
    tqs = trecquerysource(
        [file_path], doc_tag, id_tag, whitelist, blacklist,
        # help jnius select the correct constructor
        signature="([Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)V")
    topics_lst = []
    while tqs.hasNext():
        topic = tqs.next()
        qid = tqs.getQueryId()
        topics_lst.append([qid, topic])
    topics_dt = pd.DataFrame(topics_lst, columns=['qid', 'query'])
    return topics_dt


def _read_topics_trecxml(filename, tags=["query", "question", "narrative"], tokenise=True):
    """
    Parse a file containing topics in TREC-like XML format

    Args:
        filename(str): The path to the topics file

    Returns:
        pandas.Dataframe with columns=['qid','query']
    """
    import xml.etree.ElementTree as ET
    tags = set(tags)
    topics = []
    tree = ET.parse(filename)
    root = tree.getroot()
    from jnius import autoclass
    tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
    for child in root.iter('topic'):
        try:
            qid = child.attrib["number"]
        except KeyError:
            qid = child.find("number").text
        query = ""
        for tag in child:
            if tag.tag in tags:
                query_text = tag.text
                if tokenise:
                    query_text = " ".join(tokeniser.getTokens(query_text))
                query += " " + query_text
        topics.append((str(qid), query.strip()))
    return pd.DataFrame(topics, columns=["qid", "query"])


def _read_topics_singleline(filepath, tokenise=True):
    """
    Parse a file containing topics, one per line. This function uses Terrier, so supports reading directly from URLs.

    Args:
        filepath(str): The path to the topics file
        tokenise(bool): whether the query should be tokenised, using Terrier's standard Tokeniser.
            If you are using matchop formatted topics, this should be set to False.

    Returns:
        pandas.Dataframe with columns=['qid','query']
    """
    rows = []
    from jnius import autoclass
    from . import check_version
    assert check_version("5.3")
    slqIter = autoclass("org.terrier.applications.batchquerying.SingleLineTRECQuery")(filepath, tokenise)
    for q in slqIter:
        rows.append([slqIter.getQueryId(), q])
    return pd.DataFrame(rows, columns=["qid", "query"])
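
# Illustrative input for _read_topics_trecxml -- a minimal hypothetical
# topics file:
#
#     <topics>
#       <topic number="1">
#         <query>chemical reactions</query>
#       </topic>
#     </topics>
#
# This yields one row with qid "1" and the (tokenised) query text.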


def read_qrels(file_path):
    """
    Reads a file containing qrels (relevance assessments)

    Parameters:
        file_path(str): The path to the qrels file. A URL is also supported.

    Returns:
        pandas.Dataframe with columns=['qid','docno', 'label']
        with column types string, string, and int
    """
    df = pd.read_csv(file_path, sep=r'\s+',
                     names=["qid", "iter", "docno", "label"],
                     dtype={"qid": "str", "docno": "str"})
    df = df.drop(columns="iter")
    return df
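
# Illustrative input for read_qrels -- standard 4-column qrels lines
# ($qid $iter $docno $label), with hypothetical values:
#
#     1 0 d42 1
#     1 0 d43 0
#
# The "iter" column is parsed and then dropped from the returned dataframe.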


SUPPORTED_TOPICS_FORMATS = {
    "trec": _read_topics_trec,
    "trecxml": _read_topics_trecxml,
    "singleline": _read_topics_singleline
}

SUPPORTED_RESULTS_FORMATS = {
    "trec": (_read_results_trec, _write_results_trec),
    "letor": (_read_results_letor, _write_results_letor),
    "minimal": (None, _write_results_minimal)
}