Class: Kotoshu::Languages::English::Tokenizer

Inherits:
Components::WhitespaceTokenizer show all
Defined in:
lib/kotoshu/languages/en/language.rb

Overview

English tokenizer with contraction handling.

Constant Summary collapse

# Contraction suffix => two-element array of strings. The second elements
# look like part-of-speech tags, so each pair is presumably
# [expansion, tag] -- TODO confirm against the code that reads this table.
CONTRACTIONS =
{
  "n't" => ['not', 'NEG'],
  "'ll" => ['will', 'MD'],
  "'ve" => ['have', 'VBP'],
  "'re" => ['are', 'VBP'],
  "'m" => ['am', 'VBP'],
  "'d" => ['would', 'MD'],
  "'s" => ['is', 'VBZ'],
  "'clock" => ['of', 'IN'],
}.freeze
# Whole-word entry for "won't", mapped to the two words 'will' and 'not'.
WONT_EXCEPTION =
{ "won't" => ['will', 'not'] }.freeze
# Whole-word entry for "can't", mapped to 'can' and "'t".
# NOTE(review): the second element is "'t", whereas WONT_EXCEPTION uses
# 'not' -- confirm this asymmetry is intended.
CANT_EXCEPTION =
{ "can't" => ['can', "'t"] }.freeze
# Group 1 is a run of ASCII letters, group 2 is the literal 's. The
# lookahead only allows a match when the next character is an ASCII letter
# or the end of a line.
POSSESSIVE_PATTERN =
/([A-Za-z]+)('s)(?=[A-Za-z]|$)/
# Frozen word list; presumably the words after which 's is read as a
# contraction rather than a possessive -- verify against its usage.
CONTRACTION_WITH_S =
%w[it he that what who there].freeze

Constants inherited from Components::WhitespaceTokenizer

Components::WhitespaceTokenizer::TOKEN_PATTERN

Instance Method Summary collapse

Methods inherited from Components::WhitespaceTokenizer

#pattern, #punctuation?, #word_char?

Methods inherited from Components::Tokenizer

#tokenize_to_strings

Constructor Details

#initialize(expand_contractions: true) ⇒ Tokenizer

Returns a new instance of Tokenizer.



148
149
150
151
# File 'lib/kotoshu/languages/en/language.rb', line 148

# Builds the tokenizer and records whether contractions should be expanded.
#
# The parent initializer is called with no arguments (explicit +super()+),
# so the keyword below is not forwarded up the inheritance chain.
#
# @param expand_contractions [Boolean] stored on the instance; when truthy,
#   #tokenize passes its tokens through +expand_contractions+
def initialize(expand_contractions: true)
  super()
  @expand_contractions = expand_contractions
end

Instance Method Details

#tokenize(text) ⇒ Object



153
154
155
156
157
158
159
160
# File 'lib/kotoshu/languages/en/language.rb', line 153

# Tokenizes +text+ via the parent tokenizer, optionally expanding
# contractions afterwards.
#
# Nil or empty input short-circuits to an empty array without calling the
# parent. Otherwise the parent's result is returned, passed through
# +expand_contractions+ first when the instance flag is truthy.
#
# @param text [String, nil] text to tokenize
# @return [Array] empty array for nil/empty input; otherwise whatever the
#   parent tokenizer (and the optional expansion step) produces
def tokenize(text)
  return [] if text.nil? || text.empty?

  base_tokens = super(text)
  @expand_contractions ? expand_contractions(base_tokens) : base_tokens
end