Class: LangExtract::Core::RegexTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/langextract/core/tokenizer.rb

Constant Summary collapse

DEFAULT_PATTERN =
/\S+/

Instance Method Summary collapse

Constructor Details

#initialize(pattern: DEFAULT_PATTERN) ⇒ RegexTokenizer

Returns a new instance of RegexTokenizer.



20
21
22
# File 'lib/langextract/core/tokenizer.rb', line 20

# Builds a tokenizer around the given pattern.
#
# The pattern is stored as-is (no validation or copying happens here) and is
# later handed to String#scan by #tokenize.
#
# @param pattern [Regexp] pattern whose matches become tokens; defaults to
#   DEFAULT_PATTERN
def initialize(pattern: DEFAULT_PATTERN)
  @pattern = pattern
end

Instance Method Details

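#tokenize(text) ⇒ Array<Token>

Scans text with the configured pattern and returns a frozen array of Token objects, one per match, each carrying the matched text, its start/end character offsets, and its zero-based position in the result.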



24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/langextract/core/tokenizer.rb', line 24

# Splits +text+ into tokens, one per match of the configured pattern.
#
# @param text [String] the text to scan
# @return [Array<Token>] frozen array of tokens in match order; each token
#   holds the matched text, a CharInterval built from the match's begin/end
#   offsets, and its zero-based position in the array
def tokenize(text)
  # String#scan only yields the matched strings, so the MatchData for every
  # hit is captured via Regexp.last_match to get at the offsets.
  hits = text.to_enum(:scan, @pattern).map { Regexp.last_match }

  hits.each_with_index.map do |hit, position|
    span = CharInterval.new(start_pos: hit.begin(0), end_pos: hit.end(0))
    Token.new(text: hit[0], char_interval: span, index: position)
  end.freeze
end