Module: Philiprehberger::FuzzyMatch

Defined in:
lib/philiprehberger/fuzzy_match.rb,
lib/philiprehberger/fuzzy_match/lcs.rb,
lib/philiprehberger/fuzzy_match/dice.rb,
lib/philiprehberger/fuzzy_match/hamming.rb,
lib/philiprehberger/fuzzy_match/soundex.rb,
lib/philiprehberger/fuzzy_match/version.rb,
lib/philiprehberger/fuzzy_match/metaphone.rb,
lib/philiprehberger/fuzzy_match/levenshtein.rb,
lib/philiprehberger/fuzzy_match/jaro_winkler.rb,
lib/philiprehberger/fuzzy_match/damerau_levenshtein.rb

Defined Under Namespace

Modules: DamerauLevenshtein, Dice, Hamming, JaroWinkler, Lcs, Levenshtein, Metaphone, Soundex Classes: Error

Constant Summary collapse

VERSION =
'0.5.0'

Class Method Summary collapse

Class Method Details

.best(query, candidates, threshold: 0.0) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/philiprehberger/fuzzy_match.rb', line 82

def self.best(query, candidates, threshold: 0.0)
  return nil if candidates.empty?

  best_result = nil
  best_score = -1.0

  candidates.each do |candidate|
    score = ratio(query, candidate.to_s)
    if score > best_score
      best_result = candidate
      best_score = score
    end
  end

  return nil if best_score < threshold

  { match: best_result, score: best_score.round(4) }
end

.damerau_levenshtein(str_a, str_b) ⇒ Integer

Damerau-Levenshtein edit distance (counts transpositions as 1 edit)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)


53
54
55
# File 'lib/philiprehberger/fuzzy_match.rb', line 53

def self.damerau_levenshtein(str_a, str_b)
  DamerauLevenshtein.distance(str_a, str_b)
end

.damerau_ratio(str_a, str_b) ⇒ Float

Normalized Damerau-Levenshtein similarity (0.0 to 1.0)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)


62
63
64
65
66
67
68
69
70
# File 'lib/philiprehberger/fuzzy_match.rb', line 62

def self.damerau_ratio(str_a, str_b)
  a = str_a.to_s.downcase
  b = str_b.to_s.downcase
  max_len = [a.length, b.length].max
  return 1.0 if max_len.zero?

  distance = DamerauLevenshtein.distance(a, b)
  1.0 - (distance.to_f / max_len)
end

.deduplicate(array, threshold: 0.8, algorithm: :jaro_winkler) ⇒ Array<String>

Group and deduplicate similar strings

Parameters:

  • array (Array<String>)

    strings to deduplicate

  • threshold (Float) (defaults to: 0.8)

    similarity threshold (0.0 to 1.0)

  • algorithm (Symbol) (defaults to: :jaro_winkler)

    :jaro_winkler (default), :dice, or :levenshtein

Returns:

  • (Array<String>)

    unique representatives



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/philiprehberger/fuzzy_match.rb', line 230

def self.deduplicate(array, threshold: 0.8, algorithm: :jaro_winkler)
  representatives = []
  array.each do |item|
    duplicate = representatives.any? do |rep|
      score = case algorithm
              when :jaro_winkler then jaro_winkler(rep, item)
              when :dice then dice_coefficient(rep, item)
              when :levenshtein then ratio(rep, item)
              else raise Error, "Unknown algorithm: #{algorithm}"
              end
      score >= threshold
    end
    representatives << item unless duplicate
  end
  representatives
end

.dice_coefficient(str_a, str_b) ⇒ Object



26
27
28
# File 'lib/philiprehberger/fuzzy_match.rb', line 26

def self.dice_coefficient(str_a, str_b)
  Dice.coefficient(str_a, str_b)
end

.hamming(str_a, str_b) ⇒ Integer

Hamming distance for equal-length strings

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)

Raises:

  • (Error)

    if strings have different lengths



149
150
151
# File 'lib/philiprehberger/fuzzy_match.rb', line 149

def self.hamming(str_a, str_b)
  Hamming.distance(str_a, str_b)
end

.jaro_winkler(str_a, str_b) ⇒ Object



22
23
24
# File 'lib/philiprehberger/fuzzy_match.rb', line 22

def self.jaro_winkler(str_a, str_b)
  JaroWinkler.similarity(str_a, str_b)
end

.lcs(str_a, str_b) ⇒ Integer

Length of the longest common subsequence

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)


35
36
37
# File 'lib/philiprehberger/fuzzy_match.rb', line 35

def self.lcs(str_a, str_b)
  Lcs.length(str_a, str_b)
end

.lcs_ratio(str_a, str_b) ⇒ Float

Normalized LCS similarity (0.0 to 1.0)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)


44
45
46
# File 'lib/philiprehberger/fuzzy_match.rb', line 44

def self.lcs_ratio(str_a, str_b)
  Lcs.ratio(str_a, str_b)
end

.levenshtein(str_a, str_b) ⇒ Object



18
19
20
# File 'lib/philiprehberger/fuzzy_match.rb', line 18

def self.levenshtein(str_a, str_b)
  Levenshtein.distance(str_a, str_b)
end

.metaphone(string) ⇒ String

Generate a Metaphone code for a string

Parameters:

  • string (String)

Returns:

  • (String)

    Metaphone phonetic code



128
129
130
# File 'lib/philiprehberger/fuzzy_match.rb', line 128

def self.metaphone(string)
  Metaphone.code(string)
end

.phonetic_match?(a, b) ⇒ Boolean

Check if two strings match phonetically (same Soundex code)

Parameters:

  • a (String)
  • b (String)

Returns:

  • (Boolean)


137
138
139
140
141
# File 'lib/philiprehberger/fuzzy_match.rb', line 137

def self.phonetic_match?(a, b)
  sa = Soundex.code(a)
  sb = Soundex.code(b)
  !sa.empty? && sa == sb
end

.ratio(str_a, str_b) ⇒ Object



72
73
74
75
76
77
78
79
80
# File 'lib/philiprehberger/fuzzy_match.rb', line 72

def self.ratio(str_a, str_b)
  a = str_a.to_s.downcase
  b = str_b.to_s.downcase
  max_len = [a.length, b.length].max
  return 1.0 if max_len.zero?

  distance = Levenshtein.distance(a, b)
  1.0 - (distance.to_f / max_len)
end

.search(query, candidates, threshold: 0.3) ⇒ Object



101
102
103
104
105
106
107
108
109
# File 'lib/philiprehberger/fuzzy_match.rb', line 101

def self.search(query, candidates, threshold: 0.3)
  scored = candidates.map do |candidate|
    score = ratio(query, candidate.to_s)
    { match: candidate, score: score.round(4) }
  end

  results = scored.select { |r| r[:score] >= threshold }
  results.sort_by { |r| -r[:score] }
end

.soundex(string) ⇒ String

Generate a Soundex code for a string

Parameters:

  • string (String)

Returns:

  • (String)

    4-character Soundex code



120
121
122
# File 'lib/philiprehberger/fuzzy_match.rb', line 120

def self.soundex(string)
  Soundex.code(string)
end

.suggest(query, candidates, threshold: 0.6, max: 5) ⇒ Object



111
112
113
114
# File 'lib/philiprehberger/fuzzy_match.rb', line 111

def self.suggest(query, candidates, threshold: 0.6, max: 5)
  results = search(query, candidates, threshold: threshold)
  results.first(max).map { |r| r[:match] }
end

.token_set_ratio(str_a, str_b) ⇒ Float

Token-set ratio: compare token sets using Jaccard-like string similarity

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)

    similarity between 0.0 and 1.0



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/philiprehberger/fuzzy_match.rb', line 169

def self.token_set_ratio(str_a, str_b)
  tokens_a = str_a.to_s.downcase.split.uniq.sort
  tokens_b = str_b.to_s.downcase.split.uniq.sort

  return 1.0 if tokens_a == tokens_b
  return 0.0 if tokens_a.empty? && tokens_b.empty?

  intersection = tokens_a & tokens_b
  combined = (tokens_a | tokens_b).sort

  common = intersection.join(' ')
  rest_a = (tokens_a - intersection).sort.join(' ')
  rest_b = (tokens_b - intersection).sort.join(' ')

  joined_a = [common, rest_a].reject(&:empty?).join(' ')
  joined_b = [common, rest_b].reject(&:empty?).join(' ')
  joined_full = combined.join(' ')

  scores = [
    JaroWinkler.similarity(joined_a, joined_b),
    JaroWinkler.similarity(common, joined_a),
    JaroWinkler.similarity(common, joined_b),
    JaroWinkler.similarity(common, joined_full)
  ]

  scores.compact.max || 0.0
end

.token_sort_ratio(str_a, str_b) ⇒ Float

Token-sort ratio: sort tokens alphabetically, then compute Jaro-Winkler similarity

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)

    similarity between 0.0 and 1.0



158
159
160
161
162
# File 'lib/philiprehberger/fuzzy_match.rb', line 158

def self.token_sort_ratio(str_a, str_b)
  a = str_a.to_s.downcase.split.sort.join(' ')
  b = str_b.to_s.downcase.split.sort.join(' ')
  JaroWinkler.similarity(a, b)
end

.weighted_score(str_a, str_b, weights:) ⇒ Float

Weighted score combining multiple algorithms

Parameters:

  • str_a (String)
  • str_b (String)
  • weights (Hash)

    algorithm => weight pairs (must sum to 1.0)

Returns:

  • (Float)

    weighted similarity score

Raises:

  • (Error)

    if weights do not sum to 1.0



204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/philiprehberger/fuzzy_match.rb', line 204

def self.weighted_score(str_a, str_b, weights:)
  sum = weights.values.reduce(0.0, :+)
  raise Error, "Weights must sum to 1.0, got #{sum}" unless (sum - 1.0).abs < 1e-9

  algorithm_map = {
    jaro_winkler: ->(a, b) { jaro_winkler(a, b) },
    dice: ->(a, b) { dice_coefficient(a, b) },
    levenshtein_ratio: ->(a, b) { ratio(a, b) },
    lcs_ratio: ->(a, b) { lcs_ratio(a, b) },
    damerau_ratio: ->(a, b) { damerau_ratio(a, b) }
  }

  weights.reduce(0.0) do |total, (algo, weight)|
    fn = algorithm_map[algo]
    raise Error, "Unknown algorithm: #{algo}" unless fn

    total + (fn.call(str_a, str_b) * weight)
  end
end