Source code for pyterrier.terrier.tokeniser

from enum import Enum
import pyterrier as pt

class TerrierTokeniser(Enum):
    """A built-in Terrier tokeniser.

    This enum is primarily used with :class:`~pyterrier.terrier.TerrierIndexer.indexer`.
    """

    whitespace = 'whitespace'  #: Tokenise on whitespace only
    english = 'english'  #: Terrier's standard tokeniser, designed for English
    utf = 'utf'  #: A variant of Terrier's standard tokeniser, similar to English, but with UTF support.
    twitter = 'twitter'  #: Like utf, but keeps hashtags etc
    identity = 'identity'  #: Performs no tokenisation - strings are kept as is.

    @staticmethod
    def _to_obj(this) -> 'TerrierTokeniser':
        """Coerce ``this`` to a :class:`TerrierTokeniser` member when possible.

        Args:
            this: An enum member, the string value of a member (e.g.
                ``'english'``), or any other object.

        Returns:
            The matching enum member. If ``this`` is not a valid member value,
            it is returned unchanged, so the result is not guaranteed to be a
            :class:`TerrierTokeniser`.
        """
        try:
            return TerrierTokeniser(this)
        except ValueError:
            # Not a known member value; hand it back untouched so callers can
            # treat it as a custom tokeniser class name.
            return this

    @staticmethod
    def _to_class(this) -> str:
        """Return the Terrier tokeniser class name for ``this``.

        Args:
            this: An enum member, the string value of a member (e.g.
                ``'english'``), or a string naming a tokeniser class directly.

        Returns:
            The unqualified class name for built-in tokenisers, or the string
            itself when it does not match a member value.

        Raises:
            ValueError: If ``this`` is neither a known tokeniser nor a string.
        """
        # Built inside the method on purpose: a class-level dict in an Enum
        # body would itself be turned into an enum member.
        class_names = {
            TerrierTokeniser.whitespace: 'WhitespaceTokeniser',
            TerrierTokeniser.english: 'EnglishTokeniser',
            TerrierTokeniser.utf: 'UTFTokeniser',
            TerrierTokeniser.twitter: 'UTFTwitterTokeniser',
            TerrierTokeniser.identity: 'IdentityTokeniser',
        }
        # Normalise first: this is a plain Enum, so the string 'english' does
        # not compare equal to TerrierTokeniser.english and would otherwise be
        # passed through verbatim as a (non-existent) class name.
        tokeniser = TerrierTokeniser._to_obj(this)
        if isinstance(tokeniser, TerrierTokeniser) and tokeniser in class_names:
            return class_names[tokeniser]
        if isinstance(tokeniser, str):
            return tokeniser
        raise ValueError(f'Unknown/unsupported tokeniser: {this}')

    @staticmethod
    @pt.java.required
    def java_tokeniser(this):
        """Instantiate the Java tokeniser object corresponding to ``this``.

        Args:
            this: An enum member, the string value of a member, or a string
                naming a tokeniser class. A name without a ``.`` is looked up
                in the ``org.terrier.indexing.tokenisation`` package; a name
                containing a ``.`` is used as given.

        Returns:
            A new instance of the resolved class, created through
            ``pt.java.autoclass`` with a no-argument constructor.

        Raises:
            ValueError: If ``this`` is neither a known tokeniser nor a string.
        """
        # NOTE(review): ``pt.java.required`` presumably ensures the Java
        # backend is initialised before this runs -- confirm against pt.java.
        clz = TerrierTokeniser._to_class(this)
        if "." not in clz:
            clz = "org.terrier.indexing.tokenisation." + clz
        return pt.java.autoclass(clz)()