Source code for pyterrier.terrier.index_factory

from typing import Union, List
import pyterrier as pt


[docs] @pt.java.required class IndexFactory: """ The ``of()`` method of this factory class allows to load a Terrier `Index <http://terrier.org/docs/current/javadoc/org/terrier/structures/Index.html>`_. NB: This class "shades" the native Terrier `IndexFactory <http://terrier.org/docs/current/javadoc/org/terrier/structures/IndexFactory.html>`_ class - it offers essential the same API, except that the ``of()`` method contains a memory kwarg, that can be used to load additional index data structures into memory. Terrier data structures that can be loaded into memory: - 'inverted' - the inverted index, contains posting lists for each term. In the default configuration, this is read in from disk in chunks. - 'lexicon' - the dictionary. By default, a binary search of the on-disk structure is used, so loading into memory can enhance speed. - 'meta' - metadata about documents. Used as the final stage of retrieval, one seek for each retrieved document. - 'direct' - contains posting lists for each document. No speed advantage for loading into memory unless pseudo-relevance feedback is being used. - 'document' - contains document lengths, which are anyway loaded into memory. No speed advantage for loading into memory unless pseudo-relevance feedback is being used. """ @staticmethod def _load_into_memory(index, structures=['lexicon', 'direct', 'inverted', 'meta'], load=False): REWRITES = { 'meta' : { # both metaindex implementations have the same property 'org.terrier.structures.ZstdCompressedMetaIndex' : { 'index.meta.index-source' : 'fileinmem', 'index.meta.data-source' : 'fileinmem'}, 'org.terrier.structures.CompressingMetaIndex' : { 'index.meta.index-source' : 'fileinmem', 'index.meta.data-source' : 'fileinmem'} }, 'lexicon' : { 'org.terrier.structures.FSOMapFileLexicon' : { 'index.lexicon.data-source' : 'fileinmem' } }, 'direct' : { 'org.terrier.structures.bit.BitPostingIndex' : { 'index.direct.data-source' : 'fileinmem'} }, 'inverted' : { 'org.terrier.structures.bit.BitPostingIndex' : { 'index.inverted.data-source' : 'fileinmem'} }, } if "direct" in structures: REWRITES['document'] = { # we have to be sensitive to the presence of fields or not # NB: loading these structures into memory only benefit direct index access 'org.terrier.structures.FSADocumentIndex' : { 'index.document.class' : 'FSADocumentIndexInMem' }, 'org.terrier.structures.FSAFieldDocumentIndex' : { 'index.document.class' : 'FSADocumentIndexInMemFields' } } pindex = pt.java.cast("org.terrier.structures.IndexOnDisk", index) dirty_structures = set() for s in structures: if not pindex.hasIndexStructure(s): continue clz = pindex.getIndexProperty(f"index.{s}.class", "notfound") if clz not in REWRITES[s]: raise ValueError(f"Cannot load structure {s} into memory, underlying class {clz} is not supported") # we only reload an index structure if a property has changed for k, v in REWRITES[s][clz].items(): if pindex.getIndexProperty(k, "notset") != v: pindex.setIndexProperty(k, v) dirty_structures.add(s) # if the document index is reloaded, the inverted index should be reloaded too # NB: the direct index needs reloaded too, but this option is only available IF # the direct index is setup if s == "document": dirty_structures.add("inverted") # remove the old data structures from memory for s in dirty_structures: if pindex.structureCache.containsKey(s): pindex.structureCache.remove(s) # force the index structures to be loaded now if load: for s in dirty_structures: pindex.getIndexStructure(s) # dont allow the index properties to be rewritten pindex.dirtyProperties = False return index
[docs] @staticmethod def of(indexlike, memory : Union[bool, List[str]] = False): """ Loads an index. Returns a Terrier `Index <http://terrier.org/docs/current/javadoc/org/terrier/structures/Index.html>`_ object. :param indexlike: The location of the index. This can be a string, or an `IndexRef <http://terrier.org/docs/current/javadoc/org/terrier/querying/IndexRef.html>`__ object. :param memory: If the index should be loaded into memory. Use `True` for all structures, or a list of structure names. :return: A Terrier `Index <http://terrier.org/docs/current/javadoc/org/terrier/structures/Index.html>`_ object. """ load_profile = pt.terrier.J.IndexOnDisk.getIndexLoadingProfileAsRetrieval() if memory or (isinstance(memory, list) and len(memory) > 0): #MEMORY CAN BE A LIST? pt.terrier.J.IndexOnDisk.setIndexLoadingProfileAsRetrieval(False) index = pt.terrier.J.IndexFactory.of(indexlike) # noop if memory is False pt.terrier.J.IndexOnDisk.setIndexLoadingProfileAsRetrieval(load_profile) if not memory: return index if isinstance(memory, list): return IndexFactory._load_into_memory(index, structures=memory) return IndexFactory._load_into_memory(index)