Class: Kotoshu::Languages::Portuguese
Overview
Portuguese language implementation.
Supports multiple dialects: pt-BR, pt-PT, pt-AO, pt-MZ, pt-GW, pt-CV
Provides full Hunspell integration: spell checking, POS tagging, and grammar rules that specifically handle Portuguese accents and the differences between Brazilian and European Portuguese.
Defined Under Namespace
Modules: GrammarRules
Classes: POSTagger, SpellChecker, Tokenizer
Constant Summary
collapse
- HUNSPELL_DICTIONARIES =
{
'pt-BR' => {
aff: 'spec/integrational/fixtures/pt_BR.aff',
dic: 'spec/integrational/fixtures/pt_BR.dic'
},
'pt-PT' => {
aff: 'spec/integrational/fixtures/pt_PT.aff',
dic: 'spec/integrational/fixtures/pt_PT.dic'
}
}.freeze
- VARIANT_NAMES =
{
'BR' => 'Brazilian',
'PT' => 'European',
'AO' => 'Angolan',
'MZ' => 'Mozambican',
'GW' => 'Guinea-Bissau',
'CV' => 'Cape Verdean'
}.freeze
Instance Attribute Summary
#code, #name, #region, #variant
Instance Method Summary
collapse
#base_code, #base_language?, #compatible_with?, #encoding, #full_name, #info, instance, #matches_code?, #normalize, #normalize_word, #region_code, register, #rtl?, #tokenize, #valid_word?
Constructor Details
#initialize(code: "pt", name: "Portuguese", variant: nil) ⇒ Portuguese
Returns a new instance of Portuguese.
351
352
353
354
355
|
# File 'lib/kotoshu/languages/pt/language.rb', line 351
# Builds a Portuguese language object and stores its Hunspell file paths.
#
# @param code [String] language code; defaults to "pt"
# @param name [String] display name; defaults to "Portuguese"
# @param variant [String, nil] variant identifier; when it is nil (or false)
#   the value of +code+ is handed to the superclass in its place
def initialize(code: "pt", name: "Portuguese", variant: nil)
  # NOTE(review): the fallback is the whole code (e.g. "pt-BR"), whereas
  # VARIANT_NAMES is keyed by region ("BR") — confirm this is intended.
  super(code: code, name: name, variant: variant || code)

  # resolve_hunspell_paths is defined outside this view; the factories in
  # this class read the result with the :aff and :dic keys.
  @hunspell_paths = resolve_hunspell_paths(code)
end
|
Instance Method Details
#create_pos_tagger ⇒ Object
402
403
404
405
406
407
408
409
|
# File 'lib/kotoshu/languages/pt/language.rb', line 402
# Builds a part-of-speech tagger from the stored Hunspell file paths.
# Nothing is memoized here: each call constructs a new tagger.
#
# @return [POSTagger] tagger configured for Latin script with
#   POSTagger::FLAG_TO_POS as its flag mapping
def create_pos_tagger
  paths = @hunspell_paths
  options = {
    aff_path: paths[:aff],
    dic_path: paths[:dic],
    script: :latin,
    flag_mapping: POSTagger::FLAG_TO_POS
  }
  POSTagger.new(**options)
end
|
#create_spell_checker ⇒ Object
390
391
392
393
394
395
396
|
# File 'lib/kotoshu/languages/pt/language.rb', line 390
# Builds a spell checker from the stored Hunspell file paths.
# Nothing is memoized here: each call constructs a new checker.
#
# @return [SpellChecker] checker configured for Latin script
def create_spell_checker
  aff_file = @hunspell_paths[:aff]
  dic_file = @hunspell_paths[:dic]

  SpellChecker.new(aff_path: aff_file, dic_path: dic_file, script: :latin)
end
|
#create_tokenizer ⇒ Object
398
399
400
|
# File 'lib/kotoshu/languages/pt/language.rb', line 398
# Factory for tokenizers: constructs a new Tokenizer on every call and
# stores nothing on the instance.
#
# @return [Tokenizer]
def create_tokenizer
  Tokenizer.new
end
|
#default_dictionary_paths ⇒ Object
375
376
377
378
379
380
381
382
383
384
|
# File 'lib/kotoshu/languages/pt/language.rb', line 375
# Word-list file locations chosen by the language code.
#
# "pt-BR" and "pt-PT" each map to their own list; every other code falls
# through to the generic words file. A new array is built on each call.
#
# @return [Array<String>] a one-element list of file paths
def default_dictionary_paths
  lang_code = code
  return ["/usr/share/dict/brazilian"] if lang_code == "pt-BR"
  return ["/usr/share/dict/portuguese"] if lang_code == "pt-PT"

  ["/usr/share/dict/words"]
end
|
#description ⇒ Object
357
358
359
360
361
|
# File 'lib/kotoshu/languages/pt/language.rb', line 357
# Human-readable label for this language.
#
# Without a variant this is just the name. With one, the variant's entry in
# VARIANT_NAMES (or the raw variant when there is no entry) is appended in
# parentheses.
#
# @return [String]
def description
  if variant
    "#{name} (#{VARIANT_NAMES[variant] || variant})"
  else
    name
  end
end
|
#dictionary_class ⇒ Object
371
372
373
|
# File 'lib/kotoshu/languages/pt/language.rb', line 371
# Names the dictionary implementation associated with this language.
#
# @return [Dictionary::UnixWords] the constant itself, not an instance
def dictionary_class
  Dictionary::UnixWords
end
|
#normalizer ⇒ Object
367
368
369
|
# File 'lib/kotoshu/languages/pt/language.rb', line 367
# Lazily built normalizer, cached in @normalizer after the first call.
#
# @return [Language::Normalizer::Base] the same object on every later call
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
|
#script_type ⇒ Object
386
387
388
|
# File 'lib/kotoshu/languages/pt/language.rb', line 386
# Script identifier for this language.
#
# @return [Symbol] always +:latin+
def script_type
  :latin
end
|
#tokenizer ⇒ Object
363
364
365
|
# File 'lib/kotoshu/languages/pt/language.rb', line 363
# Lazily built tokenizer, cached in @tokenizer after the first call.
#
# @return [Tokenizer] the same object on every later call
def tokenizer
  @tokenizer = Tokenizer.new unless @tokenizer
  @tokenizer
end
|