Class: Kotoshu::Languages::English
Overview
English language implementation.
Supports multiple dialects: en-US, en-GB, en-AU, en-CA, en-NZ, en-ZA
Defined Under Namespace
Classes: POSTagger, SpellChecker, Tokenizer
Constant Summary
collapse
- HUNSPELL_DICTIONARIES =
{
'en-US' => {
aff: 'spec/integrational/fixtures/en_US.aff',
dic: 'spec/integrational/fixtures/en_US.dic'
},
}.freeze
- VARIANT_NAMES =
{
'US' => 'American',
'GB' => 'British',
'CA' => 'Canadian',
'AU' => 'Australian',
'NZ' => 'New Zealand',
'ZA' => 'South African'
}.freeze
Instance Attribute Summary
#code, #name, #region, #variant
Instance Method Summary
collapse
#base_code, #base_language?, #compatible_with?, #encoding, #full_name, #info, instance, #matches_code?, #normalize, #normalize_word, #region_code, register, #rtl?, #tokenize, #valid_word?
Constructor Details
#initialize(code: "en", name: "English", variant: nil) ⇒ English
Returns a new instance of English.
346
347
348
349
350
|
# File 'lib/kotoshu/languages/en/language.rb', line 346
# Creates an English language instance.
#
# @param code [String] language code, e.g. "en" or "en-GB"
# @param name [String] display name of the language
# @param variant [String, nil] region variant; when omitted it is derived from
#   the region subtag of +code+ ("en-GB" => "GB") and stays nil for a bare "en"
def initialize(code: "en", name: "English", variant: nil)
  # VARIANT_NAMES is keyed by region ("US", "GB", ...), so the default must be
  # the region subtag rather than the whole code; otherwise #description can
  # never resolve a human-readable variant name.
  region = code.to_s.split('-')[1]
  variant ||= region.upcase unless region.nil? || region.empty?
  super(code: code, name: name, variant: variant)
  @hunspell_paths = resolve_hunspell_paths(code)
end
|
Instance Method Details
#create_grammar_rules ⇒ Object
406
407
408
|
# File 'lib/kotoshu/languages/en/language.rb', line 406
# Builds a new grammar rule engine configured with the language 'en'.
#
# @return [Grammar::RuleEngine] a freshly constructed engine
def create_grammar_rules
  engine_class = Grammar::RuleEngine
  engine_class.new(language: 'en')
end
|
#create_pos_tagger ⇒ Object
397
398
399
400
401
402
403
404
|
# File 'lib/kotoshu/languages/en/language.rb', line 397
# Builds a part-of-speech tagger from the Hunspell affix/dictionary paths
# stored in @hunspell_paths.
#
# @return [POSTagger] a new tagger using the :latin script and the flag
#   mapping returned by #english_pos_flag_mapping
def create_pos_tagger
  paths = @hunspell_paths
  options = {
    aff_path: paths[:aff],
    dic_path: paths[:dic],
    script: :latin,
    flag_mapping: english_pos_flag_mapping
  }
  POSTagger.new(**options)
end
|
#create_spell_checker ⇒ Object
385
386
387
388
389
390
391
|
# File 'lib/kotoshu/languages/en/language.rb', line 385
# Builds a spell checker from the Hunspell affix/dictionary paths stored in
# @hunspell_paths.
#
# @return [SpellChecker] a new checker using the :latin script
def create_spell_checker
  paths = @hunspell_paths
  options = { aff_path: paths[:aff], dic_path: paths[:dic], script: :latin }
  SpellChecker.new(**options)
end
|
#create_tokenizer ⇒ Object
393
394
395
|
# File 'lib/kotoshu/languages/en/language.rb', line 393
# Builds a new, unshared tokenizer (unlike #tokenizer, nothing is cached).
#
# @return [Tokenizer] a fresh tokenizer instance on every call
def create_tokenizer
  fresh_tokenizer = Tokenizer.new
  fresh_tokenizer
end
|
#default_dictionary_paths ⇒ Object
370
371
372
373
374
375
376
377
378
379
|
# File 'lib/kotoshu/languages/en/language.rb', line 370
# Picks the system word list matching this instance's language code.
#
# en-GB, en-AU, en-NZ and en-ZA use the British list; en-US and en-CA use
# the American list; any other code falls back to the generic word list.
#
# @return [Array<String>] a single-element array holding the word-list path
def default_dictionary_paths
  locale = code
  if ["en-GB", "en-AU", "en-NZ", "en-ZA"].include?(locale)
    ["/usr/share/dict/british-english"]
  elsif ["en-US", "en-CA"].include?(locale)
    ["/usr/share/dict/american-english"]
  else
    ["/usr/share/dict/words"]
  end
end
|
#description ⇒ Object
352
353
354
355
356
|
# File 'lib/kotoshu/languages/en/language.rb', line 352
# Human-readable description of the language.
#
# Without a variant this is just the name. With a variant, the name is
# followed by the VARIANT_NAMES entry for it in parentheses, or by the raw
# variant when no entry exists.
#
# @return [String]
def description
  region = variant
  if region
    label = VARIANT_NAMES[region] || region
    "#{name} (#{label})"
  else
    name
  end
end
|
#dictionary_class ⇒ Object
366
367
368
|
# File 'lib/kotoshu/languages/en/language.rb', line 366
# Dictionary implementation class associated with this language.
#
# @return [Class] always Dictionary::UnixWords
def dictionary_class
  unix_words = Dictionary::UnixWords
  unix_words
end
|
#normalizer ⇒ Object
362
363
364
|
# File 'lib/kotoshu/languages/en/language.rb', line 362
# Lazily builds and caches the normalizer for this instance.
#
# @return [Language::Normalizer::Base] the same object on every call
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
|
#script_type ⇒ Object
381
382
383
|
# File 'lib/kotoshu/languages/en/language.rb', line 381
# Writing-system identifier for this language.
#
# @return [Symbol] always :latin
def script_type
  :latin
end
|
#tokenizer ⇒ Object
358
359
360
|
# File 'lib/kotoshu/languages/en/language.rb', line 358
# Lazily builds and caches the tokenizer for this instance.
#
# @return [Tokenizer] the same object on every call
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
|
#valid_in_other_variant?(word) ⇒ Hash, nil
410
411
412
413
414
415
416
417
418
419
420
421
422
423
|
# File 'lib/kotoshu/languages/en/language.rb', line 410
# Checks whether +word+ is accepted by the Hunspell dictionary of a different
# English variant than this instance's own.
#
# Variants are tried in HUNSPELL_DICTIONARIES order; entries whose .aff or
# .dic file is missing on disk are skipped. Each variant's SpellChecker is
# built once and reused, so the dictionary files are not re-read on every
# lookup.
#
# @param word [String] the word to look up
# @return [Hash{Symbol=>String}, nil] +{ variant:, code: }+ for the first other
#   variant that accepts the word; nil when none does, when this instance has
#   no variant, or when its code is the generic 'en'
def valid_in_other_variant?(word)
  return nil if @variant.nil? || @code == 'en'

  @other_variant_checkers ||= {}

  HUNSPELL_DICTIONARIES.each do |variant_code, paths|
    next if variant_code == @code
    next unless File.exist?(paths[:aff]) && File.exist?(paths[:dic])

    # Constructing a checker loads the dictionary files; cache it per variant.
    checker = (@other_variant_checkers[variant_code] ||= SpellChecker.new(
      aff_path: paths[:aff],
      dic_path: paths[:dic],
      script: :latin,
      encoding: 'ISO-8859-1'
    ))
    next unless checker.correct?(word)

    region = variant_code.split('-').last.upcase
    return { variant: VARIANT_NAMES[region] || variant_code, code: "en-#{region}" }
  end

  nil
end
|