Class: LangExtract::Core::UnicodeTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/langextract/core/tokenizer.rb

Constant Summary collapse

# Regular expression describing a single token. Alternatives are tried in
# order at each position:
#
# 1. a word: one letter followed by any run of letters, combining marks,
#    digits, underscores, apostrophes or hyphens;
# 2. a number: one or more digits, optionally followed by groups of digits
#    separated by "." or "," (for example "3.14" or "1,000");
# 3. any single remaining non-whitespace character (punctuation, symbols, ...).
#
# The last alternative uses the POSIX bracket [[:space:]] rather than \s:
# in Ruby, \s only covers ASCII whitespace, so characters such as U+00A0
# (no-break space) or U+3000 (ideographic space) would otherwise be returned
# as tokens instead of being skipped as separators.
#
# Flags: x = free-spacing layout, u = UTF-8 encoded pattern.
TOKEN_PATTERN =
/
  \p{L}[\p{L}\p{M}\p{N}_'-]* |
  \p{N}+(?:[.,]\p{N}+)* |
  [^[:space:]]
/ux

Instance Method Summary collapse

Instance Method Details

#tokenize(text) ⇒ Object



45
46
47
# File 'lib/langextract/core/tokenizer.rb', line 45

# Tokenizes +text+ by delegating to a RegexTokenizer configured with
# TOKEN_PATTERN.
#
# @param text [Object] input passed unchanged to RegexTokenizer#tokenize
#   (presumably a String; confirm against RegexTokenizer)
# @return [Object] whatever RegexTokenizer#tokenize returns for +text+
def tokenize(text)
  delegate = RegexTokenizer.new(pattern: TOKEN_PATTERN)
  delegate.tokenize(text)
end