# Source code for pyterrier_anserini._indexer

import json
import os
from typing import Dict, Iterable, List, Literal, Union

import pyterrier as pt
import pyterrier_alpha as pta

from pyterrier_anserini import AnseriniIndex


@pt.java.required
class AnseriniIndexer(pt.Indexer):
    """An indexer for Anserini indexes."""

    def __init__(
        self,
        index: Union[AnseriniIndex, str],
        *,
        fields: Union[List[str], Literal['*']] = '*',
        verbose: bool = False
    ):
        """Initializes the indexer.

        Args:
            index: The index to index to. If a string, an AnseriniIndex object is created for the path.
            fields: The fields to index. If '*' (default), all fields are indexed. Otherwise, the values of the
                fields provided in this argument are concatenated and indexed.
            verbose: Whether to display a progress bar when indexing.
        """
        self._index = index if isinstance(index, AnseriniIndex) else AnseriniIndex(index)
        self.fields = fields
        self.verbose = verbose

    __repr__ = pta.transformer_repr

    def index(self, inp: Iterable[Dict]) -> pta.Artifact:
        """Indexes the input documents to the index.

        Args:
            inp: An iterable of documents to index. Each document must provide a ``docno`` key.

        Returns:
            The index that was indexed to.

        Raises:
            AssertionError: If the target index reports that it is already built.
        """
        # Raised explicitly (rather than via an ``assert`` statement) so the guard is not
        # stripped when Python runs with -O; the exception type is unchanged for callers.
        if self._index.built():
            raise AssertionError(f'index at {self._index.path!r} is already built')

        # Imported lazily, inside the method, as in the original implementation.
        from pyserini.index.lucene import LuceneIndexer

        args = ['-index', self._index.path, '-storeContents', '-storeDocvectors']
        indexer = LuceneIndexer(self._index.path, args=args)

        # create directory and metadata file
        meta_path = os.path.join(self._index.path, 'pt_meta.json')
        if not os.path.exists(meta_path):
            os.makedirs(self._index.path, exist_ok=True)
            with open(meta_path, 'wt', encoding='utf-8') as fout:
                json.dump({
                    'type': 'sparse_index',
                    'format': 'anserini',
                    'package_hint': 'pyterrier-anserini',
                    # TODO: other stuff (like stemmer used) in due course
                }, fout)

        if self.verbose:
            inp = pt.tqdm(inp, unit='docs', desc='AnseriniIndexer')

        for doc in inp:
            indexer.add_doc_dict(self._map_doc(doc))

        # commit
        indexer.close()

        return self._index

    def _map_doc(self, doc: Dict) -> Dict:
        """Converts an input document into the ``{'id', 'contents'}`` dict handed to the indexer.

        Args:
            doc: The input document. ``docno`` becomes the ``id``.

        Returns:
            A dict with ``id`` and ``contents``. When ``fields`` is ``'*'``, ``contents`` is the newline-joined
            string-valued entries of ``doc`` other than ``docno``; otherwise it is the newline-joined ``str()``
            of each configured field, in the configured order.
        """
        if self.fields == '*':
            # Non-string values are skipped in this mode.
            contents = '\n'.join(v for k, v in doc.items() if k != 'docno' and isinstance(v, str))
        else:
            contents = '\n'.join(str(doc[k]) for k in self.fields)
        return {
            'id': doc['docno'],
            'contents': contents
        }