Class: Kotoshu::Languages::English::POSTagger

Inherits:
Components::PosTagger show all
Defined in:
lib/kotoshu/languages/en/language.rb

Overview

English POS tagger.

Constant Summary collapse

# Translation table from dictionary flags / POS tag codes (keys) to the
# coarse part-of-speech labels produced by this tagger (values). It is the
# default value of the constructor's +flag_mapping:+ keyword.
#
# The hash itself is frozen, so entries cannot be added or removed in place;
# supply a different mapping through the constructor or +flag_mapping=+.
#
# NOTE(review): 'NNP' maps to 'NOUN' whereas 'NP' maps to 'NOUN_PROPER' --
# confirm this asymmetry is intentional.
FLAG_TO_POS = {
  # Nouns
  'N'     => 'NOUN',
  'NN'    => 'NOUN',
  'NNS'   => 'NOUN',
  'NNP'   => 'NOUN',
  'NP'    => 'NOUN_PROPER',

  # Verbs
  'V'     => 'VERB',
  'VB'    => 'VERB',
  'VBD'   => 'VERB',
  'VBG'   => 'VERB',
  'VBN'   => 'VERB',
  'VBP'   => 'VERB',
  'VBZ'   => 'VERB',
  'MD'    => 'VERB_MODAL',

  # Adjectives
  'A'     => 'ADJ',
  'JJ'    => 'ADJ',
  'JJR'   => 'ADJ',
  'JJS'   => 'ADJ',

  # Adverbs
  'R'     => 'ADV',
  'RB'    => 'ADV',
  'RBR'   => 'ADV',
  'RBS'   => 'ADV',

  # Determiners
  'D'     => 'DET',
  'DT'    => 'DET',
  'PDT'   => 'DET',

  # Pronouns
  'P'     => 'PRON',
  'PP'    => 'PRON',
  'PRP'   => 'PRON',
  'PRP$'  => 'PRON_POSS',
  'WP'    => 'PRON',
  'WP$'   => 'PRON_POSS',

  # Prepositions
  'I'     => 'PREP',
  'IN'    => 'PREP',

  # Conjunctions
  'C'     => 'CONJ',
  'CC'    => 'CONJ',

  # Particles
  'U'     => 'PART',
  'RP'    => 'PART',

  # Interjections
  'INTJ'  => 'INTJ',
  'UH'    => 'INTJ',

  # Numerals and foreign words
  'CD'    => 'NUM',
  'FW'    => 'X',

  # Punctuation
  'PUNCT' => 'PUNCT',
  '.'     => 'PUNCT',
  ','     => 'PUNCT',
  '!'     => 'PUNCT',
  '?'     => 'PUNCT',
  ';'     => 'PUNCT',
  ':'     => 'PUNCT'
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Components::PosTagger

#tag_word

Constructor Details

#initialize(aff_path:, dic_path:, script: :latin, encoding: 'ISO-8859-1', flag_mapping: FLAG_TO_POS) ⇒ POSTagger

Returns a new instance of POSTagger.



235
236
237
238
239
240
241
242
243
# File 'lib/kotoshu/languages/en/language.rb', line 235

# Stores the configuration and builds the lookup object used for tagging.
#
# @param aff_path [Object] passed as the first positional argument to
#   Readers::LookupBuilder (presumably the affix-file path -- confirm)
# @param dic_path [Object] passed as the second positional argument to
#   Readers::LookupBuilder (presumably the dictionary-file path -- confirm)
# @param script [Symbol] forwarded to the builder as +script:+
# @param encoding [String] forwarded to the builder as +encoding:+
# @param flag_mapping [Hash] flag-to-POS table kept in +@flag_mapping+
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'ISO-8859-1', flag_mapping: FLAG_TO_POS)
  # Plain configuration state.
  @flag_mapping = flag_mapping
  @encoding = encoding
  @script = script
  @dic_path = dic_path
  @aff_path = aff_path

  # Starts empty; emptied again by #clear_cache.
  @lookup_cache = {}

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end

Instance Attribute Details

#aff_path ⇒ Object (readonly)

Returns the value of attribute aff_path.



233
234
235
# File 'lib/kotoshu/languages/en/language.rb', line 233

# Read-only accessor for +@aff_path+, the value given to the constructor's
# +aff_path:+ keyword.
#
# @return [Object] the stored value, unchanged
def aff_path
  @aff_path
end

#dic_path ⇒ Object (readonly)

Returns the value of attribute dic_path.



233
234
235
# File 'lib/kotoshu/languages/en/language.rb', line 233

# Read-only accessor for +@dic_path+, the value given to the constructor's
# +dic_path:+ keyword.
#
# @return [Object] the stored value, unchanged
def dic_path
  @dic_path
end

#script ⇒ Object (readonly)

Returns the value of attribute script.



233
234
235
# File 'lib/kotoshu/languages/en/language.rb', line 233

# Read-only accessor for +@script+, the value given to the constructor's
# +script:+ keyword (+:latin+ when the keyword is omitted).
#
# @return [Object] the stored value, unchanged
def script
  @script
end

Instance Method Details

#clear_cache ⇒ Object



266
267
268
# File 'lib/kotoshu/languages/en/language.rb', line 266

# Removes every entry from +@lookup_cache+, the hash created empty by the
# constructor. The cache object itself is kept and reused.
#
# @return [Object] whatever +@lookup_cache.clear+ returns (for a Hash, the
#   same, now empty, hash)
def clear_cache
  @lookup_cache.clear
end

#flag_mapping ⇒ Object



258
259
260
# File 'lib/kotoshu/languages/en/language.rb', line 258

# Returns the flag-to-POS table currently in use: the constructor's
# +flag_mapping:+ argument (FLAG_TO_POS by default) or whatever was last
# assigned through +flag_mapping=+.
#
# @return [Object] the stored mapping, not a copy
def flag_mapping
  @flag_mapping
end

#flag_mapping=(mapping) ⇒ Object



262
263
264
# File 'lib/kotoshu/languages/en/language.rb', line 262

# Replaces the flag-to-POS table used by this tagger.
#
# The lookup cache is emptied as part of the assignment so that results
# memoised under the previous mapping are not served afterwards.
# NOTE(review): this presumes cached entries embed POS tags derived from the
# mapping -- confirm against lookup_with_pos. If they do not, the clear is
# only a harmless loss of cached work.
#
# @param mapping [Hash] new flag-to-POS table
# @return [Object] the assigned mapping
def flag_mapping=(mapping)
  # Guarded because the cache may be unset on instances that did not run
  # this class's constructor.
  @lookup_cache.clear if @lookup_cache
  @flag_mapping = mapping
end

#tag(tokens) ⇒ Object



245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/kotoshu/languages/en/language.rb', line 245

# Annotates each token hash with +:pos_tag+ and +:lemma+ keys.
#
# Every element is read through +[:token]+ and extended with +merge+, so the
# input elements are not modified; new hashes are returned.
#
# @param tokens [Array<Hash>, nil] token hashes carrying a +:token+ entry
# @return [Array<Hash>] one merged hash per input token, or +[]+ when
#   +tokens+ is nil or empty
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |entry|
    surface = entry[:token]

    # Nothing to look up for a missing or blank surface form.
    next entry.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    analysis = lookup_with_pos(surface)
    # Fall back to the surface form when the lookup yields no lemma.
    lemma = analysis[:lemma] || surface
    entry.merge(pos_tag: analysis[:pos_tag], lemma: lemma)
  end
end