Class: UEAStemmer
- Inherits:
-
Object
show all
- Includes:
- StringHelpers
- Defined in:
- lib/uea-stemmer/word.rb,
lib/uea-stemmer.rb,
lib/uea-stemmer/rule.rb
Overview
Copyright 2005 University of East Anglia
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Authored by Marie-Claire Jenkins and Dr. Dan J. Smith. Ported to Java from Perl by Richard Churchill. Ported from Java to Ruby by Jason M. Adams.
Defined Under Namespace
Classes: ConcatenatingEndingRule, CustomRule, EndingRule, ExhaustiveConcatenatingEndingRule, NonExhaustiveEndingRule, Rule, Word
Constant Summary
collapse
- APOSTROPHE_PATTERN =
/['’]/
- PROBLEM_WORDS =
%w[is as this has was during menses].freeze
- SPECIAL_RULE_COUNT =
4
Instance Attribute Summary collapse
Instance Method Summary
collapse
#ends_with?, #remove_suffix
Constructor Details
#initialize(max_word_length = nil, max_acronym_length = nil, options = {}) ⇒ UEAStemmer
Returns a new instance of UEAStemmer.
35
36
37
38
39
40
41
42
|
# File 'lib/uea-stemmer.rb', line 35
# Builds a stemmer and populates its rule list.
#
# @param max_word_length [Integer, nil] words longer than this are not
#   stemmed; falls back to the length of 'deoxyribonucleicacid' (20)
# @param max_acronym_length [Integer, nil] upper-case words longer than this
#   are not stemmed; falls back to the length of 'CAVASSOO' (8)
# @param options [Hash] behaviour switches; keys are converted to symbols
#   and the resulting hash is frozen
def initialize(max_word_length = nil, max_acronym_length = nil, options = {})
  @rules = []
  @options = options.transform_keys { |key| key.to_sym }.freeze
  @max_acronym_length = max_acronym_length || 'CAVASSOO'.size
  @max_word_length = max_word_length || 'deoxyribonucleicacid'.size
  create_rules
end
|
Instance Attribute Details
#max_acronym_length ⇒ Object
Returns the value of attribute max_acronym_length.
33
34
35
|
# File 'lib/uea-stemmer.rb', line 33
# Reader for the acronym-length threshold stored at construction time.
#
# @return [Object] the current value of @max_acronym_length
def max_acronym_length
  @max_acronym_length
end
|
#max_word_length ⇒ Object
Returns the value of attribute max_word_length.
33
34
35
|
# File 'lib/uea-stemmer.rb', line 33
# Reader for the word-length threshold stored at construction time.
#
# @return [Object] the current value of @max_word_length
def max_word_length
  @max_word_length
end
|
Instance Method Details
#add_rule(rule) ⇒ Object
89
90
91
92
93
94
95
96
|
# File 'lib/uea-stemmer.rb', line 89
# Appends a frozen copy of +rule+ to the rule list.
#
# @param rule [Object] candidate rule; only instances of Rule are accepted
# @return [Boolean] true when the rule was stored, false when it was rejected
def add_rule(rule)
  return false unless rule.is_a?(Rule)

  # Store a frozen duplicate so the caller's object is left untouched.
  @rules.push(rule.dup.freeze)
  true
end
|
#num_rules ⇒ Object
81
82
83
|
# File 'lib/uea-stemmer.rb', line 81
# Counts the distinct rule numbers among the registered rules and adds
# SPECIAL_RULE_COUNT to that figure.
#
# @return [Integer] number of distinct rule numbers plus SPECIAL_RULE_COUNT
def num_rules
  distinct_rule_numbers = @rules.map(&:rule_num).uniq
  distinct_rule_numbers.size + SPECIAL_RULE_COUNT
end
|
#options ⇒ Object
48
49
50
|
# File 'lib/uea-stemmer.rb', line 48
# Reader for the options hash stored at construction time.
#
# @return [Object] the current value of @options
def options
  @options
end
|
#rules ⇒ Object
44
45
46
|
# File 'lib/uea-stemmer.rb', line 44
# Hands out a frozen shallow copy of the rule list, so callers cannot
# modify the stemmer's own array through the returned object.
#
# @return [Array] frozen duplicate of @rules
def rules
  snapshot = @rules.dup
  snapshot.freeze
end
|
#stem(word) ⇒ Object
77
78
79
|
# File 'lib/uea-stemmer.rb', line 77
# Stems +word+ and returns only the resulting text, discarding the rule
# information that stem_with_rule also reports.
#
# @param word [String] the word to stem
# @return [Object] the +word+ attribute of the stem_with_rule result
def stem(word)
  result = stem_with_rule(word)
  result.word
end
|
#stem_with_rule(word) ⇒ Object
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# File 'lib/uea-stemmer.rb', line 52
# Stems +word+ and reports which rule produced the result.
#
# Checks run in this order, and the first match wins:
# * rule 94 - problem_word? is true for the word; returned unchanged
# * rule 96 - all-capitals word longer than the acronym limit, or one
#   followed by a lower-case "s" that is longer than the limit plus one;
#   returned unchanged
# * rule 95 - word longer than the word-length limit; returned unchanged
# * rule 93 - word contains an apostrophe; a trailing apostrophe-s or bare
#   trailing apostrophe is removed, then contractions are expanded unless
#   the :skip_contractions option is set
# * otherwise - the result of apply_rules
#
# @param word [String] the word to stem
# @return [Word] the stemmed text together with the rule number (and, for
#   the apply_rules path, the rule itself)
def stem_with_rule(word)
  candidate = word.dup
  return Word.new(word, 94) if problem_word?(word)

  oversized_acronym =
    (word.size > @max_acronym_length && word.match?(/^[A-Z]+$/)) ||
    (word.size > (@max_acronym_length + 1) && word.match?(/^[A-Z]+s$/))
  return Word.new(word, 96) if oversized_acronym
  return Word.new(word, 95) if word.size > @max_word_length

  unless word.match?(APOSTROPHE_PATTERN)
    stemmed, rule_num, rule = apply_rules(candidate)
    return Word.new(stemmed, rule_num, rule)
  end

  # Number of trailing characters to drop: apostrophe + "s", a lone
  # apostrophe, or nothing when the apostrophe is not at the end.
  strip_count =
    if word.match?(/^.*['’]s$/i)
      2
    elsif word.match?(/^.*['’]$/)
      1
    end
  candidate = remove_suffix(candidate, strip_count) if strip_count
  candidate = expand_contractions(candidate) unless options[:skip_contractions]
  Word.new(candidate, 93)
end
|
#to_s ⇒ Object
85
86
87
|
# File 'lib/uea-stemmer.rb', line 85
# Human-readable label for the stemmer, including the figure reported by
# num_rules.
#
# @return [String] e.g. "UEA-Lite Stemmer (12 rules)" when num_rules is 12
def to_s
  rule_total = num_rules
  "UEA-Lite Stemmer (#{rule_total} rules)"
end
|