Class: Kotoshu::Languages::German::POSTagger

Inherits:
Components::PosTagger show all
Defined in:
lib/kotoshu/languages/de/language.rb

Overview

German POS tagger.

Derives POS tags from Hunspell flags using German-specific mappings.

Constant Summary collapse

FLAG_TO_POS =

German POS flag mappings based on Hunspell German dictionaries

{
  # Lookup table from a dictionary flag string (key) to the POS tag name
  # (value) that the tagger reports. It is the default for the constructor's
  # +flag_mapping:+ keyword.
  #
  # Each category lists short/upper-case tags first (several of them, e.g.
  # NN, VBD, JJ, look like Penn Treebank tag names — TODO confirm origin),
  # followed by German-style abbreviations such as 'Sub', 'Adj', 'Prä'.

  # Nouns (German nouns are capitalized); 'NNP' is the only proper-noun flag
  'N' => 'NOUN', 'NN' => 'NOUN', 'NNS' => 'NOUN', 'NNP' => 'NOUN_PROPER',
  'Sub' => 'NOUN',
  # Verbs: every verb flag collapses to the single tag 'VERB'
  'V' => 'VERB', 'VB' => 'VERB', 'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
  'VBP' => 'VERB', 'VBZ' => 'VERB',
  'Vfin' => 'VERB', 'Vinf' => 'VERB', 'Vpp' => 'VERB',
  # Adjectives
  'A' => 'ADJ', 'JJ' => 'ADJ', 'JJR' => 'ADJ', 'JJS' => 'ADJ',
  'Adj' => 'ADJ',
  # Adverbs
  'R' => 'ADV', 'RB' => 'ADV', 'RBR' => 'ADV', 'RBS' => 'ADV',
  'Adv' => 'ADV',
  # Determiners (articles included via 'Art')
  'D' => 'DET', 'DT' => 'DET', 'PDT' => 'DET',
  'Art' => 'DET',
  # Pronouns; flags ending in '$' map to the separate tag 'PRON_POSS'
  'P' => 'PRON', 'PP' => 'PRON', 'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
  'WP' => 'PRON', 'WP$' => 'PRON_POSS',
  'Pro' => 'PRON',
  # Prepositions ('Prä' is a non-ASCII key, so the file must be read as UTF-8)
  'I' => 'PREP', 'IN' => 'PREP',
  'Prä' => 'PREP',
  # Conjunctions
  'C' => 'CONJ', 'CC' => 'CONJ',
  'Kon' => 'CONJ',
  # Particles
  'U' => 'PART', 'RP' => 'PART',
  'Pt' => 'PART',
  # Interjections
  'INTJ' => 'INTJ', 'UH' => 'INTJ',
  'Int' => 'INTJ',
  # Numbers
  'CD' => 'NUM',
  'Num' => 'NUM',
  # Foreign words are tagged with the catch-all 'X'
  'FW' => 'X',
  # Punctuation: the literal punctuation characters are accepted as flags too
  'PUNCT' => 'PUNCT', '.' => 'PUNCT', ',' => 'PUNCT', '!' => 'PUNCT',
  '?' => 'PUNCT', ';' => 'PUNCT', ':' => 'PUNCT'
}.freeze # freeze makes the hash itself immutable (no keys added/removed/reassigned)

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Components::PosTagger

#tag_word

Constructor Details

#initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS) ⇒ POSTagger

Returns a new instance of POSTagger.



231
232
233
234
235
236
237
238
239
# File 'lib/kotoshu/languages/de/language.rb', line 231

# Builds a tagger: stores the configuration, constructs the lookup object
# through Readers::LookupBuilder, and starts with an empty lookup cache.
#
# @param aff_path [Object] first positional argument handed to
#   Readers::LookupBuilder (presumably the Hunspell .aff path — confirm)
# @param dic_path [Object] second positional argument handed to
#   Readers::LookupBuilder (presumably the Hunspell .dic path — confirm)
# @param script [Symbol] forwarded to the builder as +script:+ (default :latin)
# @param encoding [String] forwarded to the builder as +encoding:+ (default 'UTF-8')
# @param flag_mapping [Hash] flag => POS tag table (default FLAG_TO_POS)
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
  # Configuration first, in the same order the instance variables were
  # originally created.
  @aff_path, @dic_path = aff_path, dic_path
  @script, @encoding = script, encoding
  @flag_mapping = flag_mapping

  # The builder receives exactly the values stored above.
  builder = Readers::LookupBuilder.new(@aff_path, @dic_path, encoding: @encoding, script: @script)
  @lookuper = builder.build
  @lookup_cache = {}
end

Instance Attribute Details

#aff_path ⇒ Object (readonly)

Returns the value of attribute aff_path.



229
230
231
# File 'lib/kotoshu/languages/de/language.rb', line 229

# Reader for the +aff_path:+ value given to the constructor
# (presumably the path of the Hunspell affix file — confirm against
# Readers::LookupBuilder).
#
# @return [Object] the current value of @aff_path
def aff_path
  @aff_path
end

#dic_path ⇒ Object (readonly)

Returns the value of attribute dic_path.



229
230
231
# File 'lib/kotoshu/languages/de/language.rb', line 229

# Reader for the +dic_path:+ value given to the constructor
# (presumably the path of the Hunspell dictionary file — confirm against
# Readers::LookupBuilder).
#
# @return [Object] the current value of @dic_path
def dic_path
  @dic_path
end

#script ⇒ Object (readonly)

Returns the value of attribute script.



229
230
231
# File 'lib/kotoshu/languages/de/language.rb', line 229

# Reader for the +script:+ value given to the constructor, which defaults
# to :latin there.
#
# @return [Object] the current value of @script
def script
  @script
end

Instance Method Details

#clear_cache ⇒ Object



262
263
264
# File 'lib/kotoshu/languages/de/language.rb', line 262

# Empties the lookup cache in place. The cache is the Hash assigned to
# @lookup_cache by the constructor; the code that fills it is not part of
# this listing.
#
# @return [Hash] the same (now empty) cache object, as returned by Hash#clear
def clear_cache
  @lookup_cache.clear
end

#flag_mapping ⇒ Object



254
255
256
# File 'lib/kotoshu/languages/de/language.rb', line 254

# Reader for the flag => POS tag table currently in use. The constructor
# initialises it from its +flag_mapping:+ keyword, whose default is
# FLAG_TO_POS.
#
# @return [Object] the current value of @flag_mapping
def flag_mapping
  @flag_mapping
end

#flag_mapping=(mapping) ⇒ Object



258
259
260
# File 'lib/kotoshu/languages/de/language.rb', line 258

# Replaces the flag => POS tag table used by this tagger.
#
# NOTE(review): only @flag_mapping is reassigned; @lookup_cache is left
# untouched. If cached entries hold POS tags derived from the previous
# mapping they would be stale — confirm, and call #clear_cache after
# switching mappings if so.
#
# @param mapping [Object] the new mapping, stored as given (no copy, no validation)
def flag_mapping=(mapping)
  @flag_mapping = mapping
end

#tag(tokens) ⇒ Object



241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/kotoshu/languages/de/language.rb', line 241

# Annotates each token hash with :pos_tag and :lemma.
#
# Every input hash is merged into a new hash; the inputs are not mutated.
# A token whose :token value is nil or empty gets nil for both keys.
# Otherwise the values come from lookup_with_pos, and the lemma falls back
# to the surface form when the lookup gives none.
#
# @param tokens [Array<Hash>, nil] token hashes carrying a :token entry
# @return [Array<Hash>] one annotated hash per input token; [] for nil or
#   empty input
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |entry|
    surface = entry[:token]
    # Nothing to look up: emit explicit nils so both keys are always present.
    next entry.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    entry.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end