Source code for pyterrier.terrier.stopwords
from enum import Enum
from typing import Union, List, Dict
import pyterrier as pt
[docs]
class TerrierStopwords(Enum):
    """Stopword handling modes that can be requested from Terrier.

    This enum is primarily used with :class:`~pyterrier.terrier.TerrierIndexer.indexer`.
    """

    none = 'none'  #: Do not remove any stopwords
    terrier = 'terrier'  #: Use Terrier's standard stopword list
    custom = 'custom'  #: Use PyTerrierCustomStopwordList.Indexing when indexing, and PyTerrierCustomStopwordList.Retrieval when retrieving

    @staticmethod
    def _to_obj(this):
        """Normalise a stopword setting into a ``(setting, stopword_list)`` pair.

        * A ``list`` is treated as a custom stopword list: the result is
          ``(TerrierStopwords.custom, <copy of the list>)``.
        * A value that matches one of the enum values yields
          ``(<enum member>, None)``.
        * Anything else is handed back unchanged as ``(this, None)``.
        """
        if isinstance(this, list):
            return TerrierStopwords.custom, list(this)
        try:
            setting = TerrierStopwords(this)
        except ValueError:
            # not a recognised enum value: pass the input through untouched
            setting = this
        return setting, None

    @staticmethod
    @pt.java.required
    def _indexing_config(this, stopword_list : Union[List[str], None], termpipelines : List[str], properties : Dict[str,str], hooks : List):
        """Apply the stopword setting ``this`` to an indexing configuration.

        The ``termpipelines``, ``properties`` and ``hooks`` arguments are
        modified in place; nothing is returned.

        :param this: the stopword setting; ``None`` and ``TerrierStopwords.none``
            leave the configuration untouched.
        :param stopword_list: the stopwords to use; must not be ``None`` when
            ``this`` is ``TerrierStopwords.custom``.
        :param termpipelines: list of term pipeline stage names, appended to.
        :param properties: property dict; receives ``pyterrier.stopwords`` for a
            custom list.
        :param hooks: list of callables, each invoked as ``hook(pyindexer, index)``
            judging by the signature of the hook appended here; a custom list
            appends one such hook.
        :raises AssertionError: for a custom list, if the Terrier version check
            fails or ``stopword_list`` is ``None``.
        """
        if this is None or this == TerrierStopwords.none:
            # nothing to configure
            return

        if this == TerrierStopwords.terrier:
            termpipelines.append('Stopwords')
        elif this == TerrierStopwords.custom:
            java_cast = pt.java.cast
            assert pt.terrier.check_version("5.8"), "Terrier 5.8 required"
            assert stopword_list is not None, "expected to receive a stopword list"

            indexing_cls = 'org.terrier.python.PyTerrierCustomStopwordList$Indexing'
            retrieval_cls = 'org.terrier.python.PyTerrierCustomStopwordList$Retrieval'

            # commas separate the entries, so a comma inside a stopword is
            # backslash-escaped before joining
            serialised = ",".join(word.replace(",", "\\,") for word in stopword_list)
            properties["pyterrier.stopwords"] = serialised
            termpipelines.append(indexing_cls)

            def _hook(pyindexer, index):
                # Rewrites the index's own properties so that it carries the
                # python-supplied stopword list.
                props_index = java_cast("org.terrier.structures.PropertiesIndex", index)
                props_index.setIndexProperty("pyterrier.stopwords", serialised)
                # The Indexing variant obtains stopwords from the global
                # ApplicationSetup properties, whereas the Retrieval variant
                # obtains them from the *Index* properties - so swap the
                # implementation named in the stored term pipeline.
                pipeline = props_index.getIndexProperty('termpipelines', None)
                props_index.setIndexProperty("termpipelines", pipeline.replace(indexing_cls, retrieval_cls))
                props_index.flush()

            hooks.append(_hook)