Module: Kabosu

Defined in:
lib/kabosu.rb,
lib/kabosu/version.rb,
lib/kabosu/pos_matcher.rb,
lib/kabosu/dict_manager.rb,
lib/kabosu/morpheme_list.rb

Defined Under Namespace

Classes: ConfigError, DictManager, Dictionary, DictionaryError, Error, LookupError, Morpheme, MorphemeList, PosMatcher, SentenceRange, SentenceSplitError, TokenizationError, Tokenizer

Constant Summary

MODE_A =
:a
MODE_B =
:b
MODE_C =
:c
VERSION =
"0.6.10.dev.20260226.98055fb"

Class Method Summary

Class Method Details

.split_sentences(text, limit: nil, with_checker: false, ranges: false, dictionary: nil) ⇒ Object



181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/kabosu.rb', line 181

def self.split_sentences(text, limit: nil, with_checker: false, ranges: false, dictionary: nil)
  unless text.is_a?(String)
    raise ArgumentError, "text must be a String"
  end
  unless limit.nil? || limit.is_a?(Integer)
    raise ArgumentError, "limit must be an Integer or nil"
  end
  if limit && limit < 1
    raise ArgumentError, "limit must be greater than 0"
  end
  unless with_checker == true || with_checker == false
    raise ArgumentError, "with_checker must be true or false"
  end
  unless ranges == true || ranges == false
    raise ArgumentError, "ranges must be true or false"
  end
  unless dictionary.nil? || dictionary.is_a?(String)
    raise ArgumentError, "dictionary must be a String path or nil"
  end

  dict_path = nil
  if with_checker
    dict_path = dictionary || Dictionary.path
  end

  if ranges
    _split_sentences_with_ranges(text, limit, dict_path).map do |(start, finish, sentence)|
      SentenceRange.new(start, finish, sentence)
    end
  else
    _split_sentences(text, limit, dict_path)
  end
rescue RuntimeError => e
  raise SentenceSplitError.new(e.message), cause: e
end

.tokenize(text, tokenizer:) ⇒ Object

Tokenize text using an explicitly provided tokenizer.

dict = Kabosu::Dictionary.new(system_dict: Kabosu::Dictionary.path)
tok = dict.create(mode: :a)
Kabosu.tokenize("東京都に住んでいる", tokenizer: tok)


225
226
227
228
229
230
231
232
233
234
235
236
237
238
# File 'lib/kabosu.rb', line 225

def self.tokenize(text, tokenizer:)
  unless text.is_a?(String)
    raise ArgumentError, "text must be a String"
  end
  unless tokenizer.is_a?(Tokenizer)
    raise ArgumentError, "tokenizer must be a Kabosu::Tokenizer"
  end

  batch = tokenizer.__send__(:_tokenize, text)
  cost = batch.respond_to?(:internal_cost) ? batch.internal_cost : nil
  MorphemeList.new(batch, internal_cost: cost)
rescue RuntimeError => e
  raise TokenizationError.new(e.message), cause: e
end