Class: Kotoshu::Languages::Japanese

Inherits:
Kotoshu::Language::Base show all
Defined in:
lib/kotoshu/languages/ja/language.rb

Overview

Japanese language implementation.

Supports ja-JP with full CJK script support.

Uses morphological analysis via the Suika gem for tokenization and POS tagging. Japanese spell checking uses dictionary lookup with CJK character support.

Defined Under Namespace

Modules: GrammarRules
Classes: POSTagger, SpellChecker, Tokenizer

Constant Summary collapse

# Hunspell dictionary table keyed by locale code. The 'ja-JP' entry lists no
# files: Japanese dictionaries are in custom formats, and Suika uses its own
# dictionary format.
HUNSPELL_DICTIONARIES = { 'ja-JP' => {} }.freeze
# Display names for variant codes. #description looks a variant up here and
# falls back to the raw variant value when it is not listed.
VARIANT_NAMES = { 'JP' => 'Japan' }.freeze

Instance Attribute Summary

Attributes inherited from Kotoshu::Language::Base

#code, #name, #region, #variant

Instance Method Summary collapse

Methods inherited from Kotoshu::Language::Base

#base_code, #base_language?, #compatible_with?, #encoding, #full_name, #info, instance, #matches_code?, #normalize, #normalize_word, #region_code, register, #rtl?, #tokenize, #valid_word?

Constructor Details

#initialize(code: "ja", name: "Japanese", variant: nil) ⇒ Japanese

Returns a new instance of Japanese.



419
420
421
422
# File 'lib/kotoshu/languages/ja/language.rb', line 419

# Builds the Japanese language definition and hands it to the base class.
#
# @param code [String] language code, "ja" by default
# @param name [String] language name, "Japanese" by default
# @param variant [String, nil] variant to use; when nil or false it is
#   obtained from +code+ via #extract_region_code
def initialize(code: "ja", name: "Japanese", variant: nil)
  # Only consult the code when the caller did not supply a variant.
  resolved_variant = variant || extract_region_code(code)
  super(code: code, name: name, variant: resolved_variant)
end

Instance Method Details

#create_pos_tagger ⇒ Object



462
463
464
465
466
467
# File 'lib/kotoshu/languages/ja/language.rb', line 462

# Builds a POSTagger configured with the first default dictionary path and
# the tagger's own FLAG_TO_POS mapping.
#
# @return [POSTagger]
def create_pos_tagger
  primary_dictionary = default_dictionary_paths.first
  POSTagger.new(dictionary_path: primary_dictionary, flag_mapping: POSTagger::FLAG_TO_POS)
end

#create_spell_checker ⇒ Object



450
451
452
453
454
455
456
# File 'lib/kotoshu/languages/ja/language.rb', line 450

# Builds a SpellChecker for the CJK script, pointed at the first default
# dictionary path.
#
# @return [SpellChecker]
def create_spell_checker
  # Japanese relies on a custom dictionary rather than the Hunspell format.
  primary_dictionary = default_dictionary_paths.first
  SpellChecker.new(dic_path: primary_dictionary, script: :cjk)
end

#create_tokenizer ⇒ Object



458
459
460
# File 'lib/kotoshu/languages/ja/language.rb', line 458

# Instantiates a Tokenizer with no arguments.
#
# @return [Tokenizer]
def create_tokenizer
  Tokenizer.new
end

#default_dictionary_paths ⇒ Object



442
443
444
# File 'lib/kotoshu/languages/ja/language.rb', line 442

# Lists the dictionary file paths used by default. #create_spell_checker and
# #create_pos_tagger both take the first entry.
#
# NOTE(review): /usr/share/dict/words is presumably the system word list,
# which is usually not Japanese — confirm this is the intended default.
#
# @return [Array<String>]
def default_dictionary_paths
  ["/usr/share/dict/words"]
end

#description ⇒ Object



424
425
426
427
428
# File 'lib/kotoshu/languages/ja/language.rb', line 424

# Human-readable label for this language.
#
# Without a variant this is just the language name. With one, the variant's
# display name from VARIANT_NAMES (or the raw variant when it is not listed)
# is appended in parentheses.
#
# @return [String]
def description
  if variant
    region_label = VARIANT_NAMES[variant] || variant
    "#{name} (#{region_label})"
  else
    name
  end
end

#dictionary_class ⇒ Object



438
439
440
# File 'lib/kotoshu/languages/ja/language.rb', line 438

# Names the dictionary implementation for this language.
#
# @return [Class] Dictionary::UnixWords
def dictionary_class
  Dictionary::UnixWords
end

#normalizer ⇒ Object



434
435
436
# File 'lib/kotoshu/languages/ja/language.rb', line 434

# Returns the normalizer for this language, creating it on first use and
# handing back the same instance on later calls.
#
# @return [Language::Normalizer::Base]
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end

#script_type ⇒ Object



446
447
448
# File 'lib/kotoshu/languages/ja/language.rb', line 446

# Identifies the script family of this language.
#
# @return [Symbol] always :cjk
def script_type
  :cjk
end

#tokenizer ⇒ Object



430
431
432
# File 'lib/kotoshu/languages/ja/language.rb', line 430

# Returns the tokenizer for this language, building it on first use and
# reusing the same instance on later calls.
#
# Construction goes through #create_tokenizer so the tokenizer class is
# named in a single place instead of being instantiated here as well.
#
# @return [Tokenizer]
def tokenizer
  @tokenizer ||= create_tokenizer
end