Class: Kotoshu::Languages::English

Inherits:
Kotoshu::Language::Base show all
Defined in:
lib/kotoshu/languages/en/language.rb

Overview

English language implementation.

Supports multiple dialects: en-US, en-GB, en-AU, en-CA, en-NZ, en-ZA

Examples:

American English

lang = Kotoshu::Languages::English.new(code: "en-US")
checker = lang.create_spell_checker
checker.correct?("color")    # => true
checker.correct?("colour")   # => false

British English

lang = Kotoshu::Languages::English.new(code: "en-GB")
checker = lang.create_spell_checker
checker.correct?("colour")   # => true

Defined Under Namespace

Classes: POSTagger, SpellChecker, Tokenizer

Constant Summary collapse

# Hunspell affix (:aff) and dictionary (:dic) file paths, keyed by dialect code.
#
# NOTE(review): only 'en-US' is listed, and its paths are relative and point
# into the spec fixtures directory — confirm this is intended for the other
# dialects the class advertises and for use outside the test suite.
HUNSPELL_DICTIONARIES =
{
  'en-US' => {
    aff: 'spec/integrational/fixtures/en_US.aff',
    dic: 'spec/integrational/fixtures/en_US.dic'
  },
}.freeze
# Human-readable names for region codes, keyed by the upper-case region part
# of a dialect code (e.g. 'GB' for "en-GB"). Looked up by #description and
# #valid_in_other_variant?, which fall back to the raw code when a region is
# not listed here.
VARIANT_NAMES =
{
  'US' => 'American',
  'GB' => 'British',
  'CA' => 'Canadian',
  'AU' => 'Australian',
  'NZ' => 'New Zealand',
  'ZA' => 'South African'
}.freeze

Instance Attribute Summary

Attributes inherited from Kotoshu::Language::Base

#code, #name, #region, #variant

Instance Method Summary collapse

Methods inherited from Kotoshu::Language::Base

#base_code, #base_language?, #compatible_with?, #encoding, #full_name, #info, instance, #matches_code?, #normalize, #normalize_word, #region_code, register, #rtl?, #tokenize, #valid_word?

Constructor Details

#initialize(code: "en", name: "English", variant: nil) ⇒ English

Returns a new instance of English.



346
347
348
349
350
# File 'lib/kotoshu/languages/en/language.rb', line 346

# Builds an English language object for the given dialect code.
#
# @param code [String] language code, e.g. "en" or "en-US"
# @param name [String] display name of the language
# @param variant [String, nil] region variant; when nil (or false) it is
#   derived from +code+ via +extract_region_code+
def initialize(code: "en", name: "English", variant: nil)
  region = variant || extract_region_code(code)
  super(code: code, name: name, variant: region)
  # Resolved once here; #create_spell_checker and #create_pos_tagger read
  # the :aff and :dic entries from this value.
  @hunspell_paths = resolve_hunspell_paths(code)
end

Instance Method Details

#create_grammar_rules ⇒ Object



406
407
408
# File 'lib/kotoshu/languages/en/language.rb', line 406

# Builds a new grammar rule engine for English.
#
# A fresh engine is created on every call, and the language is always passed
# as 'en' regardless of this instance's dialect code.
#
# @return [Grammar::RuleEngine]
def create_grammar_rules
  Grammar::RuleEngine.new(language: 'en')
end

#create_pos_tagger ⇒ Object



397
398
399
400
401
402
403
404
# File 'lib/kotoshu/languages/en/language.rb', line 397

# Builds a new part-of-speech tagger for this dialect.
#
# The tagger receives the :aff and :dic paths held in @hunspell_paths, the
# :latin script, and the mapping returned by +english_pos_flag_mapping+.
#
# @return [POSTagger] a new tagger on every call
def create_pos_tagger
  tagger_options = {
    aff_path: @hunspell_paths[:aff],
    dic_path: @hunspell_paths[:dic],
    script: :latin,
    flag_mapping: english_pos_flag_mapping
  }
  POSTagger.new(**tagger_options)
end

#create_spell_checker ⇒ Object



385
386
387
388
389
390
391
# File 'lib/kotoshu/languages/en/language.rb', line 385

# Builds a new spell checker for this dialect.
#
# The checker receives the :aff and :dic paths held in @hunspell_paths and
# the :latin script.
#
# @return [SpellChecker] a new checker on every call
def create_spell_checker
  affix_file = @hunspell_paths[:aff]
  words_file = @hunspell_paths[:dic]
  SpellChecker.new(aff_path: affix_file, dic_path: words_file, script: :latin)
end

#create_tokenizer ⇒ Object



393
394
395
# File 'lib/kotoshu/languages/en/language.rb', line 393

# Builds a new Tokenizer.
#
# Unlike #tokenizer, which memoizes its instance, this returns a fresh object
# on every call.
#
# @return [Tokenizer]
def create_tokenizer
  Tokenizer.new
end

#default_dictionary_paths ⇒ Object



370
371
372
373
374
375
376
377
378
379
# File 'lib/kotoshu/languages/en/language.rb', line 370

# Picks the system word-list path for this instance's dialect code.
#
# en-GB, en-AU, en-NZ and en-ZA use the British list; en-US and en-CA use the
# American list; any other code uses the generic words file.
#
# @return [Array<String>] a one-element array holding the word-list path
def default_dictionary_paths
  dialect = code
  british_family = %w[en-GB en-AU en-NZ en-ZA]
  american_family = %w[en-US en-CA]

  if british_family.include?(dialect)
    ["/usr/share/dict/british-english"]
  elsif american_family.include?(dialect)
    ["/usr/share/dict/american-english"]
  else
    ["/usr/share/dict/words"]
  end
end

#description ⇒ Object



352
353
354
355
356
# File 'lib/kotoshu/languages/en/language.rb', line 352

# Human-readable label for this language.
#
# Without a variant this is just +name+. With one, the variant's display name
# from VARIANT_NAMES (or the raw variant when it is not listed) is appended
# in parentheses, e.g. "English (British)".
#
# @return [String]
def description
  region = variant
  if region
    label = VARIANT_NAMES[region] || region
    "#{name} (#{label})"
  else
    name
  end
end

#dictionary_class ⇒ Object



366
367
368
# File 'lib/kotoshu/languages/en/language.rb', line 366

# Dictionary implementation used for English.
#
# @return [Class] always Dictionary::UnixWords
def dictionary_class
  Dictionary::UnixWords
end

#normalizer ⇒ Object



362
363
364
# File 'lib/kotoshu/languages/en/language.rb', line 362

# Normalizer for this language, created on first use and then reused.
#
# @return [Language::Normalizer::Base] the memoized normalizer instance
def normalizer
  @normalizer ||= Language::Normalizer::Base.new
end

#script_type ⇒ Object



381
382
383
# File 'lib/kotoshu/languages/en/language.rb', line 381

# Writing script used by English.
#
# @return [Symbol] always :latin
def script_type
  :latin
end

#tokenizer ⇒ Object



358
359
360
# File 'lib/kotoshu/languages/en/language.rb', line 358

# Tokenizer for this language, created on first use and then reused.
#
# Use #create_tokenizer instead when a separate instance is needed.
#
# @return [Tokenizer] the memoized tokenizer instance
def tokenizer
  @tokenizer ||= Tokenizer.new
end

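#valid_in_other_variant?(word) ⇒ Hash?

Returns:

  • (Hash, nil) — { variant: <display name>, code: "en-<REGION>" } for the first other dialect whose dictionary accepts the word; nil when none does, or when this instance has no variant or its code is plain "en"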


410
411
412
413
414
415
416
417
418
419
420
421
422
423
# File 'lib/kotoshu/languages/en/language.rb', line 410

# Checks whether +word+ is accepted by the dictionary of a different dialect.
#
# Every entry of HUNSPELL_DICTIONARIES other than this instance's own code is
# tried in order; entries whose .aff or .dic file is missing on disk are
# skipped. A new SpellChecker is built for each entry that is tried.
#
# Despite the trailing "?", the result is not a strict boolean.
#
# @param word [String] the word to look up
# @return [Hash, nil] +{ variant:, code: }+ for the first dialect that accepts
#   the word; nil when none does, or when this instance has no variant or its
#   code is plain 'en'
def valid_in_other_variant?(word)
  return if @variant.nil? || @code == 'en'

  HUNSPELL_DICTIONARIES.each_pair do |dialect, files|
    next if dialect == @code

    affix_file = files[:aff]
    words_file = files[:dic]
    next unless File.exist?(affix_file) && File.exist?(words_file)

    speller = SpellChecker.new(aff_path: affix_file, dic_path: words_file, script: :latin, encoding: 'ISO-8859-1')
    next unless speller.correct?(word)

    region = dialect.split('-').last.upcase
    return { variant: VARIANT_NAMES[region] || dialect, code: "en-#{region}" }
  end

  nil
end