Source code for pyterrier.terrier.stemmer

from enum import Enum
import pyterrier as pt


_stemmer_cache = {}


[docs] class TerrierStemmer(Enum): """A built-in Terrier stemmer. The stemming configuration is saved in the index and loaded at retrieval time. `Snowball <https://snowballstem.org/>`_ stemmers for various languages `are available in Terrier <http://terrier.org/docs/current/javadoc/org/terrier/terms/package-summary.html>`_. This enum is primarily used with :class:`~pyterrier.terrier.TerrierIndexer.indexer`. """ none = 'none' #: Apply no stemming porter = 'porter' #: Apply Porter's English stemmer weakporter = 'weakporter' #: Apply a weak version of Porter's English stemmer # available snowball stemmers in Terrier danish = 'danish' #: Snowball Danish stemmer finnish = 'finnish' #: Snowball Finnish stemmer german = 'german' #: Snowball German stemmer hungarian = 'hungarian' #: Snowball Hungarian stemmer norwegian = 'norwegian' #: Snowball Norwegian stemmer portugese = 'portugese' #: Snowball Portuguese stemmer spanish = 'spanish' #: Snowball Spanish stemmer swedish = 'swedish' #: Snowball Swedish stemmer turkish = 'turkish' #: Snowball Turkish stemmer @staticmethod def _to_obj(this): try: return TerrierStemmer(this) except ValueError: return this @staticmethod def _to_class(this): if this is None or this == TerrierStemmer.none: return None if this == TerrierStemmer.porter: return 'PorterStemmer' if this == TerrierStemmer.weakporter: return 'WeakPorterStemmer' # snowball stemmers if this == TerrierStemmer.danish: return 'DanishSnowballStemmer' if this == TerrierStemmer.finnish: return 'FinnishSnowballStemmer' if this == TerrierStemmer.german: return 'GermanSnowballStemmer' if this == TerrierStemmer.hungarian: return 'HungarianSnowballStemmer' if this == TerrierStemmer.norwegian: return 'NorwegianSnowballStemmer' if this == TerrierStemmer.portugese: return 'PortugueseSnowballStemmer' if this == TerrierStemmer.spanish: return 'SpanishSnowballStemmer' if this == TerrierStemmer.swedish: return 'SwedishSnowballStemmer' if this == TerrierStemmer.turkish: return 'TurkishSnowballStemmer' if isinstance(this, str): return this
[docs] @pt.java.required def stem(self, tok): """Stem a single token using this stemmer. .. code-block:: python stemmer = pt.TerrierStemmer.porter stemmed_word = stemmer.stem('abandoned') """ if self not in _stemmer_cache: clz_name = self._to_class(self) if clz_name is None: _stemmer_cache[self] = _NoOpStem() else: if '.' not in clz_name: clz_name = f'org.terrier.terms.{clz_name}' # stemmers are termpipeline objects, and these have chained constructors # pass None to use the appropriate constructor _stemmer_cache[self] = pt.java.autoclass(clz_name)(None) return _stemmer_cache[self].stem(tok)
class _NoOpStem(): def stem(self, word): return word