Class: UEAStemmer

Inherits:
Object
  • Object
show all
Includes:
StringHelpers
Defined in:
lib/uea-stemmer/word.rb,
lib/uea-stemmer.rb,
lib/uea-stemmer/rule.rb

Overview

Copyright 2005 University of East Anglia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Authored by Marie-Claire Jenkins and Dr. Dan J. Smith. Ported to Java from Perl by Richard Churchill. Ported from Java to Ruby by Jason M. Adams.

Direct Known Subclasses

DefaultUEAStemmer

Defined Under Namespace

Classes: ConcatenatingEndingRule, CustomRule, EndingRule, ExhaustiveConcatenatingEndingRule, NonExhaustiveEndingRule, Rule, Word

Constant Summary collapse

APOSTROPHE_PATTERN =
/['’]/
PROBLEM_WORDS =
%w[is as this has was during menses].freeze
SPECIAL_RULE_COUNT =
4

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from StringHelpers

#ends_with?, #remove_suffix

Constructor Details

#initialize(max_word_length = nil, max_acronym_length = nil, options = {}) ⇒ UEAStemmer

Returns a new instance of UEAStemmer.



35
36
37
38
39
40
41
42
# File 'lib/uea-stemmer.rb', line 35

# Builds a stemmer: records the two length limits and the options, then
# hands off to create_rules to populate the rule list.
#
# @param max_word_length [Integer, nil] upper bound compared against
#   word.size in stem_with_rule; when nil (or false) the length of
#   'deoxyribonucleicacid' is used
# @param max_acronym_length [Integer, nil] upper bound used by the
#   all-uppercase check in stem_with_rule; when nil (or false) the length
#   of 'CAVASSOO' is used
# @param options [Hash] settings; keys are converted with to_sym and the
#   resulting hash is frozen
def initialize(max_word_length = nil, max_acronym_length = nil, options = {})
  @max_acronym_length = max_acronym_length || 'CAVASSOO'.size
  @max_word_length = max_word_length || 'deoxyribonucleicacid'.size
  @options = options.transform_keys { |key| key.to_sym }.freeze

  # The list is created before create_rules runs.
  # NOTE(review): create_rules is defined outside this listing; presumably
  # it appends to @rules -- confirm.
  @rules = []
  create_rules
end

Instance Attribute Details

#max_acronym_length ⇒ Object (readonly)

Returns the value of attribute max_acronym_length.



33
34
35
# File 'lib/uea-stemmer.rb', line 33

# Reader for the acronym length limit stored by the constructor.
#
# @return [Object] whatever is held in @max_acronym_length (an Integer
#   when the constructor's default is used)
def max_acronym_length
  @max_acronym_length
end

#max_word_length ⇒ Object (readonly)

Returns the value of attribute max_word_length.



33
34
35
# File 'lib/uea-stemmer.rb', line 33

# Reader for the word length limit stored by the constructor.
#
# @return [Object] whatever is held in @max_word_length (an Integer when
#   the constructor's default is used)
def max_word_length
  @max_word_length
end

Instance Method Details

#add_rule(rule) ⇒ Object



89
90
91
92
93
94
95
96
# File 'lib/uea-stemmer.rb', line 89

# Registers an additional rule with this stemmer.
#
# Only objects that are a Rule (or a subclass of it) are accepted. The
# stemmer keeps a frozen shallow copy, not the object passed in.
#
# @param rule [Rule] the rule to register
# @return [Boolean] true when the rule was stored, false when the argument
#   is not a Rule
def add_rule(rule)
  return false unless rule.is_a?(Rule)

  own_copy = rule.dup
  @rules << own_copy.freeze
  true
end

#num_rules ⇒ Object



81
82
83
# File 'lib/uea-stemmer.rb', line 81

# Counts the rules this stemmer knows about.
#
# Registered rules that share a rule_num are counted once; the constant
# SPECIAL_RULE_COUNT is then added on top.
# NOTE(review): SPECIAL_RULE_COUNT presumably accounts for the cases
# stem_with_rule handles itself (rule numbers 93-96) -- confirm.
#
# @return [Integer] distinct rule numbers plus SPECIAL_RULE_COUNT
def num_rules
  distinct_numbers = @rules.map(&:rule_num).uniq
  distinct_numbers.size + SPECIAL_RULE_COUNT
end

#options ⇒ Object



48
49
50
# File 'lib/uea-stemmer.rb', line 48

# Reader for the options given to the constructor.
#
# @return [Hash] the hash held in @options; the constructor stores it
#   frozen and with symbol keys
def options
  @options
end

#rules ⇒ Object



44
45
46
# File 'lib/uea-stemmer.rb', line 44

# Hands out a read-only snapshot of the registered rules.
#
# A fresh frozen copy of the list is produced on every call, so callers
# cannot add to or remove from the stemmer's own list through it.
#
# @return [Array] frozen shallow copy of @rules
def rules
  snapshot = @rules.dup
  snapshot.freeze
end

#stem(word) ⇒ Object



77
78
79
# File 'lib/uea-stemmer.rb', line 77

# Stems a word and returns only the stemmed text, dropping the rule
# information that stem_with_rule attaches.
#
# @param word [String] the word to stem
# @return [Object] the +word+ attribute of the stem_with_rule result
def stem(word)
  result = stem_with_rule(word)
  result.word
end

#stem_with_rule(word) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/uea-stemmer.rb', line 52

# Stems +word+ and reports which rule number produced the result.
#
# The checks run in this order and the first one that applies wins:
#   94 - problem_word?(word) is true; the word is returned as given
#   96 - the word is all uppercase A-Z and longer than @max_acronym_length,
#        or all uppercase plus a trailing lowercase "s" and longer than
#        @max_acronym_length + 1; returned as given
#   95 - the word is longer than @max_word_length; returned as given
#   93 - the word matches APOSTROPHE_PATTERN; remove_suffix is asked to
#        drop 2 characters for a trailing apostrophe + "s" (any case) or
#        1 character for a trailing apostrophe, and the result goes through
#        expand_contractions unless options[:skip_contractions] is set
#   otherwise - whatever apply_rules returns (text, rule number, rule)
#
# NOTE(review): the regexes below anchor with ^ and $, which match at line
# boundaries rather than at the ends of the string, so multi-line input can
# match on a single line. Kept as-is to preserve behaviour.
#
# @param word [String] the word to stem; it is not modified here
# @return [Word] the stemmed text with its rule number (and the rule
#   object when apply_rules supplied one)
def stem_with_rule(word)
  # Working copy handed to the helpers; +word+ itself is only inspected.
  working_copy = word.dup

  return Word.new(word, 94) if problem_word?(word)

  length = word.size
  acronym = (length > @max_acronym_length && word.match?(/^[A-Z]+$/)) ||
            (length > (@max_acronym_length + 1) && word.match?(/^[A-Z]+s$/))
  return Word.new(word, 96) if acronym
  return Word.new(word, 95) if length > @max_word_length

  unless word.match?(APOSTROPHE_PATTERN)
    stemmed, rule_num, rule = apply_rules(working_copy)
    return Word.new(stemmed, rule_num, rule)
  end

  if word.match?(/^.*['’]s$/i)
    working_copy = remove_suffix(working_copy, 2)
  elsif word.match?(/^.*['’]$/)
    working_copy = remove_suffix(working_copy, 1)
  end
  working_copy = expand_contractions(working_copy) unless options[:skip_contractions]

  Word.new(working_copy, 93)
end

#to_s ⇒ Object



85
86
87
# File 'lib/uea-stemmer.rb', line 85

# Human-readable label for the stemmer that includes the rule count
# reported by num_rules.
#
# @return [String] e.g. "UEA-Lite Stemmer (7 rules)" when num_rules is 7
def to_s
  count = num_rules
  "UEA-Lite Stemmer (#{count} rules)"
end