Class: Kotoshu::Languages::Japanese
Overview
Japanese language implementation.
Supports ja-JP with full CJK script support.
Uses morphological analysis via the Suika gem for tokenization and POS tagging. Japanese spell checking uses dictionary lookup with CJK character support.
Defined Under Namespace
Modules: GrammarRules
Classes: POSTagger, SpellChecker, Tokenizer
Constant Summary
collapse
- HUNSPELL_DICTIONARIES =
{
'ja-JP' => {
}
}.freeze
- VARIANT_NAMES =
{
'JP' => 'Japan'
}.freeze
Instance Attribute Summary
#code, #name, #region, #variant
Instance Method Summary
collapse
#base_code, #base_language?, #compatible_with?, #encoding, #full_name, #info, instance, #matches_code?, #normalize, #normalize_word, #region_code, register, #rtl?, #tokenize, #valid_word?
Constructor Details
#initialize(code: "ja", name: "Japanese", variant: nil) ⇒ Japanese
Returns a new instance of Japanese.
419
420
421
422
|
# File 'lib/kotoshu/languages/ja/language.rb', line 419
# Sets up the Japanese language definition and forwards it to the parent
# initializer.
#
# @param code [String] language code, "ja" by default
# @param name [String] display name, "Japanese" by default
# @param variant [String, nil] variant identifier; when it is nil (or false)
#   the value of +code+ is passed on in its place
#
# NOTE(review): VARIANT_NAMES is keyed by region ('JP'), so a variant that
# falls back to the raw code (e.g. "ja") will not match any entry there —
# confirm that using the code itself as the fallback is intended.
def initialize(code: "ja", name: "Japanese", variant: nil)
  super(code: code, name: name, variant: variant || code)
end
|
Instance Method Details
#create_pos_tagger ⇒ Object
462
463
464
465
466
467
|
# File 'lib/kotoshu/languages/ja/language.rb', line 462
# Builds a new part-of-speech tagger for Japanese.
#
# The tagger receives the first entry of #default_dictionary_paths as its
# dictionary and POSTagger::FLAG_TO_POS as its flag mapping.
#
# @return [POSTagger] a freshly constructed tagger (not memoized)
def create_pos_tagger
  primary_dictionary = default_dictionary_paths.first

  POSTagger.new(dictionary_path: primary_dictionary, flag_mapping: POSTagger::FLAG_TO_POS)
end
|
#create_spell_checker ⇒ Object
450
451
452
453
454
455
456
|
# File 'lib/kotoshu/languages/ja/language.rb', line 450
# Builds a new spell checker for Japanese.
#
# The checker is pointed at the first entry of #default_dictionary_paths and
# is told to treat text as the :cjk script.
#
# @return [SpellChecker] a freshly constructed checker (not memoized)
def create_spell_checker
  primary_dictionary = default_dictionary_paths.first

  SpellChecker.new(dic_path: primary_dictionary, script: :cjk)
end
|
#create_tokenizer ⇒ Object
458
459
460
|
# File 'lib/kotoshu/languages/ja/language.rb', line 458
# Builds a new tokenizer with default settings.
#
# Unlike #tokenizer, nothing is cached here: each call constructs another
# Tokenizer.
#
# @return [Tokenizer] a freshly constructed tokenizer
def create_tokenizer
  Tokenizer.new
end
|
#default_dictionary_paths ⇒ Object
442
443
444
|
# File 'lib/kotoshu/languages/ja/language.rb', line 442
# Lists the dictionary file locations used when none are given explicitly.
#
# A new, unfrozen array is returned on every call.
#
# NOTE(review): /usr/share/dict/words is conventionally an English word list
# on Unix systems — confirm this is the intended default for Japanese.
#
# @return [Array<String>] a single-element list of dictionary paths
def default_dictionary_paths
  %w[/usr/share/dict/words]
end
|
#description ⇒ Object
424
425
426
427
428
|
# File 'lib/kotoshu/languages/ja/language.rb', line 424
# Produces a human-readable label for this language.
#
# Without a variant the plain name is returned. With a variant, the label is
# "<name> (<variant label>)", where the variant label comes from
# VARIANT_NAMES and falls back to the raw variant when no entry exists.
#
# @return [String] the language name, optionally followed by its variant
def description
  if variant
    variant_label = VARIANT_NAMES[variant] || variant
    "#{name} (#{variant_label})"
  else
    name
  end
end
|
#dictionary_class ⇒ Object
438
439
440
|
# File 'lib/kotoshu/languages/ja/language.rb', line 438
# Names the dictionary implementation associated with this language.
#
# @return [Class] the Dictionary::UnixWords class itself, not an instance
def dictionary_class
  Dictionary::UnixWords
end
|
#normalizer ⇒ Object
434
435
436
|
# File 'lib/kotoshu/languages/ja/language.rb', line 434
# Returns the normalizer for this language, creating it on first use.
#
# The instance is cached in @normalizer, so later calls hand back the same
# object.
#
# @return [Language::Normalizer::Base] the memoized normalizer
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
|
#script_type ⇒ Object
446
447
448
|
# File 'lib/kotoshu/languages/ja/language.rb', line 446
# Identifies the writing-system category of this language.
#
# @return [Symbol] always :cjk
def script_type
  :cjk
end
|
#tokenizer ⇒ Object
430
431
432
|
# File 'lib/kotoshu/languages/ja/language.rb', line 430
# Returns the tokenizer for this language, creating it on first use.
#
# The instance is cached in @tokenizer, so later calls hand back the same
# object.
#
# @return [Tokenizer] the memoized tokenizer
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
|