Module: Kotoshu

Defined in:
lib/kotoshu/embeddings.rb,
lib/kotoshu.rb,
lib/kotoshu.rb,
lib/kotoshu/cli.rb,
lib/kotoshu/core.rb,
lib/kotoshu/cache.rb,
lib/kotoshu/paths.rb,
lib/kotoshu/grammar.rb,
lib/kotoshu/readers.rb,
lib/kotoshu/version.rb,
lib/kotoshu/defaults.rb,
lib/kotoshu/keyboard.rb,
lib/kotoshu/language.rb,
lib/kotoshu/integrity.rb,
lib/kotoshu/languages.rb,
lib/kotoshu/algorithms.rb,
lib/kotoshu/cli/errors.rb,
lib/kotoshu/debug_mode.rb,
lib/kotoshu/cache/cache.rb,
lib/kotoshu/debug_logger.rb,
lib/kotoshu/grammar/rule.rb,
lib/kotoshu/spellchecker.rb,
lib/kotoshu/configuration.rb,
lib/kotoshu/cli/auto_setup.rb,
lib/kotoshu/core/trie/node.rb,
lib/kotoshu/core/trie/trie.rb,
lib/kotoshu/fluent_checker.rb,
lib/kotoshu/metrics_module.rb,
lib/kotoshu/models/context.rb,
lib/kotoshu/plugins/plugin.rb,
lib/kotoshu/project_config.rb,
lib/kotoshu/results/result.rb,
lib/kotoshu/string_metrics.rb,
lib/kotoshu/core/exceptions.rb,
lib/kotoshu/dictionary/base.rb,
lib/kotoshu/keyboard/layout.rb,
lib/kotoshu/resource_bundle.rb,
lib/kotoshu/source_registry.rb,
lib/kotoshu/cache/base_cache.rb,
lib/kotoshu/core/models/word.rb,
lib/kotoshu/plugins/registry.rb,
lib/kotoshu/readers/aff_data.rb,
lib/kotoshu/resource_manager.rb,
lib/kotoshu/algorithms/lookup.rb,
lib/kotoshu/cache/model_cache.rb,
lib/kotoshu/cli/cache_command.rb,
lib/kotoshu/cli/status_report.rb,
lib/kotoshu/core/trie/builder.rb,
lib/kotoshu/dictionary/cspell.rb,
lib/kotoshu/dictionary/custom.rb,
lib/kotoshu/keyboard/registry.rb,
lib/kotoshu/language/detector.rb,
lib/kotoshu/language/registry.rb,
lib/kotoshu/metrics_collector.rb,
lib/kotoshu/models/onnx_model.rb,
lib/kotoshu/models/suggestion.rb,
lib/kotoshu/algorithms/suggest.rb,
lib/kotoshu/cache/lookup_cache.rb,
lib/kotoshu/cli/batch_reporter.rb,
lib/kotoshu/dictionary/unified.rb,
lib/kotoshu/documents/document.rb,
lib/kotoshu/documents/location.rb,
lib/kotoshu/integrity/manifest.rb,
lib/kotoshu/integrity/net_http.rb,
lib/kotoshu/readers/aff_reader.rb,
lib/kotoshu/readers/dic_reader.rb,
lib/kotoshu/dictionary/hunspell.rb,
lib/kotoshu/grammar/rule_engine.rb,
lib/kotoshu/grammar/rule_loader.rb,
lib/kotoshu/integrity/audit_log.rb,
lib/kotoshu/language/identifier.rb,
lib/kotoshu/personal_dictionary.rb,
lib/kotoshu/readers/file_reader.rb,
lib/kotoshu/suggestions/context.rb,
lib/kotoshu/cache/language_cache.rb,
lib/kotoshu/components/tokenizer.rb,
lib/kotoshu/dictionaries/catalog.rb,
lib/kotoshu/suggestions/pipeline.rb,
lib/kotoshu/cache/frequency_cache.rb,
lib/kotoshu/cli/display_formatter.rb,
lib/kotoshu/cli/language_resolver.rb,
lib/kotoshu/cli/progress_reporter.rb,
lib/kotoshu/components/pos_tagger.rb,
lib/kotoshu/configuration/builder.rb,
lib/kotoshu/dictionary/plain_text.rb,
lib/kotoshu/dictionary/repository.rb,
lib/kotoshu/dictionary/unix_words.rb,
lib/kotoshu/languages/de/language.rb,
lib/kotoshu/languages/en/language.rb,
lib/kotoshu/languages/es/language.rb,
lib/kotoshu/languages/fr/language.rb,
lib/kotoshu/languages/ja/language.rb,
lib/kotoshu/languages/pt/language.rb,
lib/kotoshu/languages/ru/language.rb,
lib/kotoshu/models/fasttext_model.rb,
lib/kotoshu/models/semantic_error.rb,
lib/kotoshu/models/word_embedding.rb,
lib/kotoshu/suggestions/generator.rb,
lib/kotoshu/cache/suggestion_cache.rb,
lib/kotoshu/cli/navigation_manager.rb,
lib/kotoshu/commands/cache_command.rb,
lib/kotoshu/commands/check_command.rb,
lib/kotoshu/commands/model_command.rb,
lib/kotoshu/components/synthesizer.rb,
lib/kotoshu/configuration/resolver.rb,
lib/kotoshu/core/models/affix_rule.rb,
lib/kotoshu/models/embedding_model.rb,
lib/kotoshu/readers/lookup_builder.rb,
lib/kotoshu/suggestions/suggestion.rb,
lib/kotoshu/algorithms/permutations.rb,
lib/kotoshu/core/indexed_dictionary.rb,
lib/kotoshu/keyboard/layouts/azerty.rb,
lib/kotoshu/keyboard/layouts/dvorak.rb,
lib/kotoshu/keyboard/layouts/jcuken.rb,
lib/kotoshu/keyboard/layouts/qwerty.rb,
lib/kotoshu/keyboard/layouts/qwertz.rb,
lib/kotoshu/language/languages/base.rb,
lib/kotoshu/language/tokenizer/base.rb,
lib/kotoshu/models/nearest_neighbor.rb,
lib/kotoshu/algorithms/ngram_suggest.rb,
lib/kotoshu/cli/interactive_reviewer.rb,
lib/kotoshu/components/spell_checker.rb,
lib/kotoshu/data/common_words_loader.rb,
lib/kotoshu/language/normalizer/base.rb,
lib/kotoshu/algorithms/capitalization.rb,
lib/kotoshu/algorithms/phonet_suggest.rb,
lib/kotoshu/readers/condition_checker.rb,
lib/kotoshu/suggestions/suggestion_set.rb,
lib/kotoshu/analyzers/semantic_analyzer.rb,
lib/kotoshu/documents/asciidoc_document.rb,
lib/kotoshu/documents/markdown_document.rb,
lib/kotoshu/data_structures/bloom_filter.rb,
lib/kotoshu/embeddings/similarity_search.rb,
lib/kotoshu/documents/plain_text_document.rb,
lib/kotoshu/spellchecker/parallel_checker.rb,
lib/kotoshu/core/models/result/word_result.rb,
lib/kotoshu/components/whitespace_tokenizer.rb,
lib/kotoshu/core/models/result/document_result.rb,
lib/kotoshu/language/tokenizer/latin_tokenizer.rb,
lib/kotoshu/language/tokenizer/french_tokenizer.rb,
lib/kotoshu/language/tokenizer/german_tokenizer.rb,
lib/kotoshu/components/passthrough_spell_checker.rb,
lib/kotoshu/language/tokenizer/russian_tokenizer.rb,
lib/kotoshu/language/tokenizer/spanish_tokenizer.rb,
lib/kotoshu/suggestions/strategies/base_strategy.rb,
lib/kotoshu/grammar/pattern_matchers/base_matcher.rb,
lib/kotoshu/language/tokenizer/japanese_tokenizer.rb,
lib/kotoshu/suggestions/strategies/ngram_strategy.rb,
lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb,
lib/kotoshu/suggestions/strategies/phonetic_strategy.rb,
lib/kotoshu/suggestions/strategies/semantic_strategy.rb,
lib/kotoshu/suggestions/strategies/symspell_strategy.rb,
lib/kotoshu/suggestions/strategies/composite_strategy.rb,
lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb,
lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb,
lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb,
lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb,
lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb

Overview

Embeddings module for FastText ONNX integration.

Provides semantic spell checking using FastText word embeddings. Supports 157 languages through pre-converted ONNX models.

Examples:

Simple usage (recommended)

pipeline = Kotoshu::Embeddings.from_cache(language: 'en')
neighbors = pipeline.find_nearest('semantic', k: 5)

Advanced usage

vocab = Kotoshu::Embeddings::Vocabulary.from_file('vocab.json')
model = Kotoshu::Embeddings::OnnxRuntimeModel.from_file('model.onnx')
engine = Kotoshu::Embeddings::SimilarityEngine.new(pre_normalize: true)

Defined Under Namespace

Modules: Algorithms, Analyzers, Cache, Cli, Components, Core, Data, DataStructures, Debug, Defaults, Dictionaries, Dictionary, Documents, Embeddings, Grammar, Integrity, Keyboard, Language, Languages, Metrics, Models, Paths, Plugins, Readers, Results, StringMetrics, Suggestions Classes: AffixRuleError, CacheCommand, CheckCommand, Configuration, ConfigurationError, DictionaryNotFoundError, Error, IntegrityError, InvalidDictionaryFormatError, ModelCommand, PersonalDictionary, ProjectConfig, ResourceBundle, ResourceManager, ResourceNotCachedError, ResourceNotSetupError, ResourceResolutionError, SourceRegistry, SpellcheckError, Spellchecker

Constant Summary collapse

VERSION =
"0.3.0"

Class Method Summary collapse

Class Method Details

.check(text, language: nil, **_options) ⇒ Models::Result::DocumentResult

Check text for spelling errors. Hot path.

Examples:

Kotoshu.setup(:en)
result = Kotoshu.check("Hello wrold")
result.errors.map(&:word)  # => ["wrold"]

Parameters:

  • text (String)

    The text to check

  • language (String, Symbol, nil) (defaults to: nil)

    Language code; if nil, uses configured default

  • options (Hash)

    Additional options (accepted but currently ignored by this method)

Returns:

Raises:



322
323
324
325
# File 'lib/kotoshu.rb', line 322

# Spell-check a string of text.
#
# @param text [String] the text to check
# @param language [String, Symbol, nil] language code; when nil the default
#   spellchecker is used
# @param _options [Hash] accepted but not used by this method
# @return [Models::Result::DocumentResult] result of the selected checker's #check
def self.check(text, language: nil, **_options)
  # No explicit language: fall back to the memoized default spellchecker.
  return spellchecker.check(text) unless language

  spellchecker_for(language).check(text)
end

.check_file(path, language: nil, **_options) ⇒ Models::Result::DocumentResult

Check a file for spelling errors. Hot path.

Examples:

Kotoshu.setup(:en)
result = Kotoshu.check_file("README.md")
result.success?  # => false

Parameters:

  • path (String)

    The file path

  • language (String, Symbol, nil) (defaults to: nil)

    Language code

  • options (Hash)

    Additional options (accepted but currently ignored by this method)

Returns:

Raises:



339
340
341
342
# File 'lib/kotoshu.rb', line 339

# Spell-check the file at +path+.
#
# @param path [String] the file path
# @param language [String, Symbol, nil] language code; when nil the default
#   spellchecker is used
# @param _options [Hash] accepted but not used by this method
# @return [Models::Result::DocumentResult] result of the selected checker's #check_file
def self.check_file(path, language: nil, **_options)
  # No explicit language: fall back to the memoized default spellchecker.
  return spellchecker.check_file(path) unless language

  spellchecker_for(language).check_file(path)
end

.check_files(paths, **options) ⇒ Array<Models::Result::DocumentResult>

Check multiple files for spelling errors.

Examples:

results = Kotoshu.check_files(%w[README.md CHANGELOG.md])
results.select(&:failed?)

Parameters:

  • paths (Array<String>)

    The file paths

  • options (Hash)

    Options forwarded unchanged to check_file for every path (e.g. language:)

Returns:



353
354
355
# File 'lib/kotoshu.rb', line 353

# Spell-check several files, one after another.
#
# Each path is passed to check_file together with the same keyword options,
# and the results are returned in the order of +paths+.
#
# @param paths [Array<String>] the file paths
# @param options [Hash] keyword options forwarded to check_file
# @return [Array<Models::Result::DocumentResult>] one result per path
def self.check_files(paths, **options)
  paths.map { |path| check_file(path, **options) }
end

.configuration ⇒ Configuration

Get the global configuration.

Examples:

config = Kotoshu.configuration

Returns:



135
136
137
# File 'lib/kotoshu.rb', line 135

# Returns the global configuration by delegating to Configuration.instance.
#
# @return [Configuration]
def self.configuration
  Configuration.instance
end

.configure {|configuration| ... } ⇒ Configuration

Configure Kotoshu. Yields the global configuration instance to the block (if one is given) and returns that instance.

Examples:

Kotoshu.configure do |config|
  config.dictionary_path = "/usr/share/dict/words"
  config.language = "en-US"
end

Yields:

Returns:



124
125
126
127
# File 'lib/kotoshu.rb', line 124

# Yields the global configuration to the given block, if any, and returns it.
#
# @yield [configuration] the object returned by .configuration
# @return [Configuration] the object returned by .configuration
def self.configure
  # The block is optional; without one this is just an accessor.
  yield configuration if block_given?
  configuration
end

.correct?(word, language: nil) ⇒ Boolean

Check if a word is spelled correctly. Hot path — cache-only, raises if language not set up.

Examples:

Kotoshu.setup(:en)
Kotoshu.correct?("hello")            # => true
Kotoshu.correct?("Hallo", language: "de")  # requires Kotoshu.setup(:de) first

Parameters:

  • word (String)

    The word to check

  • language (String, Symbol, nil) (defaults to: nil)

    Language code; if nil, uses configured default

Returns:

  • (Boolean)

    True if the word is correct

Raises:



278
279
280
281
# File 'lib/kotoshu.rb', line 278

# Tell whether +word+ is spelled correctly.
#
# @param word [String] the word to check
# @param language [String, Symbol, nil] language code; when nil the default
#   spellchecker is used
# @return [Boolean] result of the selected checker's #correct?
def self.correct?(word, language: nil)
  # No explicit language: fall back to the memoized default spellchecker.
  return spellchecker.correct?(word) unless language

  spellchecker_for(language).correct?(word)
end

.detect_language(text) ⇒ String?

Detect language of text.

Examples:

Kotoshu.detect_language("Bonjour le monde")  # => "fr"
Kotoshu.detect_language("こんにちは")        # => "ja"

Parameters:

  • text (String)

    Text to analyze

Returns:

  • (String, nil)

    Detected language code



445
446
447
# File 'lib/kotoshu.rb', line 445

# Detects the language of +text+ by delegating to Language.detect.
#
# @param text [String] text to analyze
# @return [String, nil] detected language code, as returned by Language.detect
def self.detect_language(text)
  Language.detect(text)
end

.detect_language_with_confidence(text) ⇒ Array<String, Float>

Detect language with confidence score.

Examples:

lang, conf = Kotoshu.detect_language_with_confidence("Hello world")
lang  # => "en"
conf  # => 0.85

Parameters:

  • text (String)

    Text to analyze

Returns:

  • (Array<String, Float>)

    Language code and confidence



458
459
460
# File 'lib/kotoshu.rb', line 458

# Detects the language of +text+ together with a confidence score by
# delegating to Language.detect_with_confidence.
#
# @param text [String] text to analyze
# @return [Array<String, Float>] language code and confidence
def self.detect_language_with_confidence(text)
  Language.detect_with_confidence(text)
end

.dictionary(source = nil) ⇒ Core::IndexedDictionary

Convenience method for creating an indexed dictionary.

Parameters:

  • source (Array<String>, String, Hash, nil) (defaults to: nil)

    Words or file path

Returns:



361
362
363
364
365
366
367
368
369
370
371
372
# File 'lib/kotoshu.rb', line 361

# Build an indexed dictionary from a word list or a file.
#
# @param source [Array<String>, String, Hash, nil] an Array is used as the word
#   list, a String is loaded via IndexedDictionary.from_file, and nil or a Hash
#   yields an empty dictionary (the Hash contents are not used here)
# @return [Core::IndexedDictionary]
# @raise [ArgumentError] if +source+ is of any other type
def self.dictionary(source = nil)
  case source
  when nil, Hash then Core::IndexedDictionary.new
  when String    then Core::IndexedDictionary.from_file(source)
  when Array     then Core::IndexedDictionary.new(source)
  else
    raise ArgumentError, "Invalid dictionary source: #{source.inspect}"
  end
end

.get_language(code) ⇒ Class?

Get language class by code.

Examples:

Kotoshu.get_language("en-US")

Parameters:

  • code (String)

    Language code (e.g., “en-US”, “de-DE”)

Returns:

  • (Class, nil)

    Language class or nil



469
470
471
# File 'lib/kotoshu.rb', line 469

# Looks up a language class by code by delegating to Language.get.
#
# @param code [String] language code (e.g. "en-US", "de-DE")
# @return [Class, nil] language class or nil, as returned by Language.get
def self.get_language(code)
  Language.get(code)
end

.language ⇒ Module

Access the language module.

Examples:

Kotoshu::Language.detect("Hello world")  # => "en"

Returns:

  • (Module)

    The Language module



433
434
435
# File 'lib/kotoshu.rb', line 433

# Returns the Language module itself.
#
# @return [Module] the Language module
def self.language
  Language
end

.language_registered?(code) ⇒ Boolean

Check if a language is registered.

Examples:

Kotoshu.language_registered?("en-US")  # => true or false

Parameters:

  • code (String)

    Language code

Returns:

  • (Boolean)

    True if registered



480
481
482
# File 'lib/kotoshu.rb', line 480

# Tells whether a language code is registered by delegating to
# Language.registered?.
#
# @param code [String] language code
# @return [Boolean] true if registered
def self.language_registered?(code)
  Language.registered?(code)
end

.languages_setup ⇒ Array<String>

List languages that have been set up.

Examples:

Kotoshu.languages_setup  # => ["de", "en", "fr"]

Returns:

  • (Array<String>)

    Sorted array of language codes with cached spelling



249
250
251
# File 'lib/kotoshu.rb', line 249

# Lists the languages that have been set up by delegating to
# ResourceManager.languages_setup.
#
# @return [Array<String>] language codes, as returned by ResourceManager
def self.languages_setup
  ResourceManager.languages_setup
end

.misspelled?(word, language: nil) ⇒ Boolean

Check if a word is misspelled. Hot path.

Parameters:

  • word (String)

    The word to check

  • language (String, Symbol, nil) (defaults to: nil)

    Language code

Returns:

  • (Boolean)

    True if the word is misspelled

Raises:



289
290
291
# File 'lib/kotoshu.rb', line 289

# Tells whether +word+ is misspelled; the logical negation of correct?.
#
# @param word [String] the word to check
# @param language [String, Symbol, nil] language code, forwarded to correct?
# @return [Boolean] true if correct? returns a falsy value
def self.misspelled?(word, language: nil)
  !correct?(word, language: language)
end

.register_dictionary_type(type, klass) ⇒ Object

Register a custom dictionary type.

Examples:

Kotoshu.register_dictionary_type(:my_custom, MyDictionary)

Parameters:

  • type (Symbol)

    The type key

  • klass (Class)

    The dictionary class



412
413
414
# File 'lib/kotoshu.rb', line 412

# Registers a custom dictionary type by delegating to Dictionary.register_type.
#
# @param type [Symbol] the type key
# @param klass [Class] the dictionary class
# @return [Object] whatever Dictionary.register_type returns
def self.register_dictionary_type(type, klass)
  Dictionary.register_type(type, klass)
end

.register_suggestion_algorithm(name, klass) ⇒ Object

Register a custom suggestion algorithm.

Examples:

Kotoshu.register_suggestion_algorithm(:my_custom, MyStrategy)

Parameters:

  • name (Symbol)

    The algorithm name

  • klass (Class)

    The algorithm class



423
424
425
# File 'lib/kotoshu.rb', line 423

# Registers a custom suggestion algorithm by delegating to
# Suggestions::Strategies::BaseStrategy.register_type.
#
# @param name [Symbol] the algorithm name
# @param klass [Class] the algorithm class
# @return [Object] whatever BaseStrategy.register_type returns
def self.register_suggestion_algorithm(name, klass)
  Suggestions::Strategies::BaseStrategy.register_type(name, klass)
end

.reset_spellchecker ⇒ Object

Reset the spellchecker cache. The next call to `spellchecker` or `spellchecker_for` re-resolves from the current configuration.

Does NOT eagerly reload — clearing the cache is enough. This makes the method safe to call between tests even when no language is set up yet (the next call will raise ResourceNotSetupError per the strict two-stage contract).



260
261
262
263
264
# File 'lib/kotoshu.rb', line 260

# Drop the memoized default spellchecker and the per-language cache.
#
# Nothing is reloaded here; both instance variables are simply cleared.
#
# @return [nil]
def self.reset_spellchecker
  # The chained assignment clears both caches and itself evaluates to nil.
  @spellchecker = @spellcheckers = nil
end

.resolve(language: nil, want: nil) ⇒ ResourceBundle

Resolve language resources from the cache (no download).

Examples:

Kotoshu.setup(:en)
bundle = Kotoshu.resolve(language: "en")
bundle.dictionary  # => #<Dictionary::Hunspell ...>

Parameters:

  • language (String, Symbol, nil) (defaults to: nil)

    Language code; if nil, uses default

  • want (Array<Symbol>) (defaults to: nil)

    Resource types (default: [:spelling])

Returns:

Raises:



183
184
185
186
187
188
189
# File 'lib/kotoshu.rb', line 183

# Resolve language resources through ResourceManager.resolve.
#
# @param language [String, Symbol, nil] language code; when nil the configured
#   default language is used
# @param want [Array<Symbol>, nil] resource types; when nil
#   ResourceManager::DEFAULT_WANT is used
# @return [ResourceBundle] whatever ResourceManager.resolve returns
# @raise [ResourceNotSetupError] if no usable language is available, i.e. the
#   effective language is nil or blank
def self.resolve(language: nil, want: nil)
  lang = language || configuration.default_language

  # Same guard as .spellchecker: a blank language is as unusable as a missing
  # one, so fail here instead of handing it to the resource manager.
  if lang.nil? || lang.to_s.empty?
    raise ResourceNotSetupError.new("default", "spelling")
  end

  ResourceManager.resolve(language: lang, want: want || ResourceManager::DEFAULT_WANT)
end

.setup(*languages, want: nil, **opts) ⇒ SetupResult, Array<SetupResult>

Set up resources for one or more languages (download or register local files). Idempotent: re-running with the same args is a no-op unless `force: true`.

Examples:

Download from kotoshu/dictionaries

Kotoshu.setup(:en)                                 # spelling only
Kotoshu.setup(:en, want: %i[spelling frequency])   # spelling + frequency
Kotoshu.setup(:en, :de, :fr)                       # multiple languages

Register local files (user already has hunspell dicts)

Kotoshu.setup(:en, aff: "/usr/share/hunspell/en_US.aff",
                    dic: "/usr/share/hunspell/en_US.dic")

Register local files from a directory

Kotoshu.setup(:en, from: "/usr/share/hunspell/")  # looks for en.aff, en.dic

Parameters:

  • languages (String, Symbol, Array<String, Symbol>)

    One or more language codes

  • want (Array<Symbol>) (defaults to: nil)

    Resource types to fetch (default: [:spelling])

  • force (Boolean)

    Re-fetch even if already cached

  • strict (Boolean)

    Re-raise on optional-resource failure

  • aff (String, nil)

    Path to local .aff file (single-language only)

  • dic (String, nil)

    Path to local .dic file (single-language only)

  • from (String, nil)

    Directory containing local .aff/.dic (single-language only)

  • frequency (String, nil)

    Path to local frequency.json (single-language only)

Returns:

  • (SetupResult, Array<SetupResult>)

    Result or results (array if multiple languages)

Raises:

  • (ArgumentError)


217
218
219
220
221
222
223
224
225
226
# File 'lib/kotoshu.rb', line 217

# Set up resources for one or more languages via ResourceManager.setup.
#
# @param languages [Array<String, Symbol>] one or more language codes
# @param want [Array<Symbol>, nil] resource types; when nil
#   ResourceManager::DEFAULT_WANT is used
# @param opts [Hash] extra keyword options passed unchanged to
#   ResourceManager.setup for every language
# @return [Object, Array<Object>] the single ResourceManager.setup result when
#   exactly one language is given, otherwise an array with one result per language
# @raise [ArgumentError] if no language is given
def self.setup(*languages, want: nil, **opts)
  raise ArgumentError, "Kotoshu.setup requires at least one language" if languages.empty?

  resources = want || ResourceManager::DEFAULT_WANT
  outcomes = languages.map { |code| ResourceManager.setup(code, want: resources, **opts) }

  # A lone language yields a bare result rather than a one-element array.
  outcomes.size == 1 ? outcomes.first : outcomes
end

.setup?(language, resource = nil) ⇒ Boolean

Check if a language (or a specific resource for that language) is set up.

Examples:

Kotoshu.setup(:en)
Kotoshu.setup?(:en)              # => true
Kotoshu.setup?(:en, :spelling)   # => true
Kotoshu.setup?(:en, :frequency)  # => false (not set up)

Parameters:

  • language (String, Symbol)

    Language code

  • resource (Symbol, nil) (defaults to: nil)

    :spelling, :frequency, :model, or nil for any

Returns:

  • (Boolean)

    True if the resource is cached and available



239
240
241
# File 'lib/kotoshu.rb', line 239

# Tells whether a language (or one resource of it) is set up by delegating to
# ResourceManager.setup?.
#
# @param language [String, Symbol] language code
# @param resource [Symbol, nil] resource type, passed on as the resource: keyword
# @return [Boolean] as returned by ResourceManager.setup?
def self.setup?(language, resource = nil)
  ResourceManager.setup?(language, resource: resource)
end

.spellchecker ⇒ Spellchecker

Default spellchecker (singleton). Uses the configured default language. Cache-only — raises ResourceNotSetupError if the default language hasn’t been set up via Kotoshu.setup.

Returns:

Raises:



145
146
147
148
149
150
151
152
# File 'lib/kotoshu.rb', line 145

# Return the memoized default spellchecker, building it on first use from the
# configured default language.
#
# @return [Spellchecker] the checker returned by spellchecker_for
# @raise [ResourceNotSetupError] if the configured default language is nil or
#   blank
def self.spellchecker
  @spellchecker ||= begin
    default_lang = configuration.default_language
    if default_lang.nil? || default_lang.to_s.empty?
      raise ResourceNotSetupError.new(default_lang || "default", "spelling")
    end

    spellchecker_for(default_lang)
  end
end

.spellchecker_for(language) ⇒ Spellchecker

Get a spellchecker for a specific language (cache-only, raises on miss).

Examples:

Kotoshu.setup(:de)
Kotoshu.spellchecker_for("de").correct?("Hallo")  # => true

Parameters:

  • language (String, Symbol)

    Language code (e.g., “en”, “de”, “fr”)

Returns:

  • (Spellchecker)

    Spellchecker using a ResourceManager-resolved bundle

Raises:



163
164
165
166
167
168
169
170
# File 'lib/kotoshu.rb', line 163

# Return the spellchecker for +language+, creating and caching it on first use.
#
# Checkers are cached under the string form of the language, so "de" and :de
# share one instance.
#
# @param language [String, Symbol] language code (e.g. "en", "de", "fr")
# @return [Spellchecker] checker built from the bundle that
#   ResourceManager.resolve returns and the global configuration
def self.spellchecker_for(language)
  cache_key = language.to_s
  registry = (@spellcheckers ||= {})

  cached = registry[cache_key]
  return cached if cached

  registry[cache_key] = Spellchecker.new(
    resource_bundle: ResourceManager.resolve(language: language),
    config: configuration
  )
end

.suggest(word, language: nil, **options) ⇒ Suggestions::SuggestionSet

Get spelling suggestions for a word. Hot path.

Examples:

Kotoshu.setup(:en)
suggestions = Kotoshu.suggest("helo")
suggestions.to_words  # => ["hello", "help", "held", ...]

Parameters:

  • word (String)

    The misspelled word

  • language (String, Symbol, nil) (defaults to: nil)

    Language code

  • options (Hash)

    Options (max_suggestions, etc.)

Returns:

Raises:



305
306
307
308
# File 'lib/kotoshu.rb', line 305

# Get spelling suggestions for +word+.
#
# @param word [String] the misspelled word
# @param language [String, Symbol, nil] language code; when nil the default
#   spellchecker is used
# @param options [Hash] keyword options forwarded to the checker's #suggest
# @return [Suggestions::SuggestionSet] result of the selected checker's #suggest
def self.suggest(word, language: nil, **options)
  # No explicit language: fall back to the memoized default spellchecker.
  return spellchecker.suggest(word, **options) unless language

  spellchecker_for(language).suggest(word, **options)
end

.suggestion_pipeline(*strategies) ⇒ Suggestions::Strategies::CompositeStrategy

Convenience method for creating a suggestion pipeline.

Parameters:

  • strategies (Array)

    Optional strategies to add

Returns:



399
400
401
402
403
# File 'lib/kotoshu.rb', line 399

# Build a composite suggestion strategy named :default and add the given
# strategies to it in order.
#
# @param strategies [Array] optional strategies to add
# @return [Suggestions::Strategies::CompositeStrategy] the populated composite
def self.suggestion_pipeline(*strategies)
  Suggestions::Strategies::CompositeStrategy.new(name: :default).tap do |composite|
    strategies.each { |strategy| composite.add(strategy) }
  end
end

.supported_languages ⇒ Array<String>

Get all supported language codes.

Examples:

Kotoshu.supported_languages  # => ["de-DE", "en-US", "fr-FR", ...]

Returns:

  • (Array<String>)

    List of language codes



490
491
492
# File 'lib/kotoshu.rb', line 490

# Returns all supported language codes by delegating to
# Language.supported_codes.
#
# @return [Array<String>] list of language codes
def self.supported_languages
  Language.supported_codes
end

.trie(source = nil) ⇒ Core::Trie::Trie

Convenience method for creating a trie.

Parameters:

  • source (Array<String>, String, nil) (defaults to: nil)

    Words or file path

Returns:



378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# File 'lib/kotoshu.rb', line 378

# Build a trie from a word list, a file, or a string.
#
# @param source [Array<String>, String, nil] an Array goes to
#   Builder.from_array; a String goes to Builder.from_file when it names an
#   existing file and to Builder.from_string otherwise; nil yields an empty trie
# @return [Core::Trie::Trie]
# @raise [ArgumentError] if +source+ is of any other type
def self.trie(source = nil)
  case source
  when nil
    Core::Trie::Trie.new
  when Array
    Core::Trie::Builder.from_array(source)
  when String
    # A string naming an existing file is read as a file; anything else is
    # handed to the builder as literal content.
    return Core::Trie::Builder.from_file(source) if File.exist?(source)

    Core::Trie::Builder.from_string(source)
  else
    raise ArgumentError, "Invalid trie source: #{source.inspect}"
  end
end