Module: Philiprehberger::FuzzyMatch

Defined in:
lib/philiprehberger/fuzzy_match.rb,
lib/philiprehberger/fuzzy_match/lcs.rb,
lib/philiprehberger/fuzzy_match/dice.rb,
lib/philiprehberger/fuzzy_match/hamming.rb,
lib/philiprehberger/fuzzy_match/soundex.rb,
lib/philiprehberger/fuzzy_match/version.rb,
lib/philiprehberger/fuzzy_match/metaphone.rb,
lib/philiprehberger/fuzzy_match/levenshtein.rb,
lib/philiprehberger/fuzzy_match/jaro_winkler.rb,
lib/philiprehberger/fuzzy_match/damerau_levenshtein.rb

Defined Under Namespace

Modules: DamerauLevenshtein, Dice, Hamming, JaroWinkler, Lcs, Levenshtein, Metaphone, Soundex Classes: Error

Constant Summary collapse

VERSION =
'0.8.0'

Class Method Summary collapse

Class Method Details

.best(query, candidates, threshold: 0.0) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/philiprehberger/fuzzy_match.rb', line 82

def self.best(query, candidates, threshold: 0.0)
  return nil if candidates.empty?

  best_result = nil
  best_score = -1.0

  candidates.each do |candidate|
    score = ratio(query, candidate.to_s)
    if score > best_score
      best_result = candidate
      best_score = score
    end
  end

  return nil if best_score < threshold

  { match: best_result, score: best_score.round(4) }
end

.closest_n(query, candidates, n:, algorithm: :jaro_winkler) ⇒ Array<Hash>

Return the top N matches sorted by score descending

Parameters:

  • query (String)
  • candidates (Array<String>)
  • n (Integer)

    number of results to return (required)

  • algorithm (Symbol) (defaults to: :jaro_winkler)

    :jaro_winkler (default), :dice, or :levenshtein

Returns:

  • (Array<Hash>)

    up to n candidates as ‘{ match:, score: }` sorted by score desc



137
138
139
140
# File 'lib/philiprehberger/fuzzy_match.rb', line 137

def self.closest_n(query, candidates, n:, algorithm: :jaro_winkler)
  ranked = rank(query, candidates, algorithm: algorithm)
  ranked.first(n).map { |entry| { match: entry[:value], score: entry[:score] } }
end

.damerau_levenshtein(str_a, str_b) ⇒ Integer

Damerau-Levenshtein edit distance (counts transpositions as 1 edit)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)


53
54
55
# File 'lib/philiprehberger/fuzzy_match.rb', line 53

def self.damerau_levenshtein(str_a, str_b)
  DamerauLevenshtein.distance(str_a, str_b)
end

.damerau_ratio(str_a, str_b) ⇒ Float

Normalized Damerau-Levenshtein similarity (0.0 to 1.0)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)


62
63
64
65
66
67
68
69
70
# File 'lib/philiprehberger/fuzzy_match.rb', line 62

def self.damerau_ratio(str_a, str_b)
  a = str_a.to_s.downcase
  b = str_b.to_s.downcase
  max_len = [a.length, b.length].max
  return 1.0 if max_len.zero?

  distance = DamerauLevenshtein.distance(a, b)
  1.0 - (distance.to_f / max_len)
end

.deduplicate(array, threshold: 0.8, algorithm: :jaro_winkler) ⇒ Array<String>

Group and deduplicate similar strings

Parameters:

  • array (Array<String>)

    strings to deduplicate

  • threshold (Float) (defaults to: 0.8)

    similarity threshold (0.0 to 1.0)

  • algorithm (Symbol) (defaults to: :jaro_winkler)

    :jaro_winkler (default), :dice, or :levenshtein

Returns:

  • (Array<String>)

    unique representatives



256
257
258
259
260
261
262
263
264
265
# File 'lib/philiprehberger/fuzzy_match.rb', line 256

def self.deduplicate(array, threshold: 0.8, algorithm: :jaro_winkler)
  representatives = []
  array.each do |item|
    duplicate = representatives.any? do |rep|
      score_for(rep, item, algorithm: algorithm) >= threshold
    end
    representatives << item unless duplicate
  end
  representatives
end

.dice_coefficient(str_a, str_b) ⇒ Object



26
27
28
# File 'lib/philiprehberger/fuzzy_match.rb', line 26

def self.dice_coefficient(str_a, str_b)
  Dice.coefficient(str_a, str_b)
end

.hamming(str_a, str_b) ⇒ Integer

Hamming distance for equal-length strings

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)

Raises:

  • (Error)

    if strings have different lengths



175
176
177
# File 'lib/philiprehberger/fuzzy_match.rb', line 175

def self.hamming(str_a, str_b)
  Hamming.distance(str_a, str_b)
end

.jaro_winkler(str_a, str_b) ⇒ Object



22
23
24
# File 'lib/philiprehberger/fuzzy_match.rb', line 22

def self.jaro_winkler(str_a, str_b)
  JaroWinkler.similarity(str_a, str_b)
end

.lcs(str_a, str_b) ⇒ Integer

Length of the longest common subsequence

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)


35
36
37
# File 'lib/philiprehberger/fuzzy_match.rb', line 35

def self.lcs(str_a, str_b)
  Lcs.length(str_a, str_b)
end

.lcs_ratio(str_a, str_b) ⇒ Float

Normalized LCS similarity (0.0 to 1.0)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)


44
45
46
# File 'lib/philiprehberger/fuzzy_match.rb', line 44

def self.lcs_ratio(str_a, str_b)
  Lcs.ratio(str_a, str_b)
end

.levenshtein(str_a, str_b) ⇒ Object



18
19
20
# File 'lib/philiprehberger/fuzzy_match.rb', line 18

def self.levenshtein(str_a, str_b)
  Levenshtein.distance(str_a, str_b)
end

.metaphone(string) ⇒ String

Generate a Metaphone code for a string

Parameters:

  • string (String)

Returns:

  • (String)

    Metaphone phonetic code



154
155
156
# File 'lib/philiprehberger/fuzzy_match.rb', line 154

def self.metaphone(string)
  Metaphone.code(string)
end

.phonetic_match?(a, b) ⇒ Boolean

Check if two strings match phonetically (same Soundex code)

Parameters:

  • a (String)
  • b (String)

Returns:

  • (Boolean)


163
164
165
166
167
# File 'lib/philiprehberger/fuzzy_match.rb', line 163

def self.phonetic_match?(a, b)
  sa = Soundex.code(a)
  sb = Soundex.code(b)
  !sa.empty? && sa == sb
end

.rank(query, candidates, algorithm: :jaro_winkler) ⇒ Array<Hash>

Rank all candidates by similarity to the query (descending)

Parameters:

  • query (String)
  • candidates (Array<String>)
  • algorithm (Symbol) (defaults to: :jaro_winkler)

    :jaro_winkler (default), :dice, or :levenshtein

Returns:

  • (Array<Hash>)

    all candidates as ‘{ value:, score: }` sorted by score desc, ties broken by original input order (stable)



123
124
125
126
127
128
# File 'lib/philiprehberger/fuzzy_match.rb', line 123

def self.rank(query, candidates, algorithm: :jaro_winkler)
  scored = candidates.each_with_index.map do |candidate, index|
    [{ value: candidate, score: score_for(query, candidate.to_s, algorithm: algorithm) }, index]
  end
  scored.sort_by { |pair, index| [-pair[:score], index] }.map(&:first)
end

.ratio(str_a, str_b) ⇒ Object



72
73
74
75
76
77
78
79
80
# File 'lib/philiprehberger/fuzzy_match.rb', line 72

def self.ratio(str_a, str_b)
  a = str_a.to_s.downcase
  b = str_b.to_s.downcase
  max_len = [a.length, b.length].max
  return 1.0 if max_len.zero?

  distance = Levenshtein.distance(a, b)
  1.0 - (distance.to_f / max_len)
end

.search(query, candidates, threshold: 0.3) ⇒ Object



101
102
103
104
105
106
107
108
109
# File 'lib/philiprehberger/fuzzy_match.rb', line 101

def self.search(query, candidates, threshold: 0.3)
  scored = candidates.map do |candidate|
    score = ratio(query, candidate.to_s)
    { match: candidate, score: score.round(4) }
  end

  results = scored.select { |r| r[:score] >= threshold }
  results.sort_by { |r| -r[:score] }
end

.similarity_matrix(strings, algorithm: :jaro_winkler, threshold: nil) ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
# File 'lib/philiprehberger/fuzzy_match.rb', line 267

def self.similarity_matrix(strings, algorithm: :jaro_winkler, threshold: nil)
  matrix = {}
  strings.each do |a|
    matrix[a] = {}
    strings.each do |b|
      score = score_for(a, b, algorithm: algorithm).round(4)
      matrix[a][b] = score if threshold.nil? || score >= threshold
    end
  end
  matrix
end

.soundex(string) ⇒ String

Generate a Soundex code for a string

Parameters:

  • string (String)

Returns:

  • (String)

    4-character Soundex code



146
147
148
# File 'lib/philiprehberger/fuzzy_match.rb', line 146

def self.soundex(string)
  Soundex.code(string)
end

.suggest(query, candidates, threshold: 0.6, max: 5) ⇒ Object



111
112
113
114
# File 'lib/philiprehberger/fuzzy_match.rb', line 111

def self.suggest(query, candidates, threshold: 0.6, max: 5)
  results = search(query, candidates, threshold: threshold)
  results.first(max).map { |r| r[:match] }
end

.token_set_ratio(str_a, str_b) ⇒ Float

Token-set ratio: compare token sets using Jaccard-like string similarity

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)

    similarity between 0.0 and 1.0



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/philiprehberger/fuzzy_match.rb', line 195

def self.token_set_ratio(str_a, str_b)
  tokens_a = str_a.to_s.downcase.split.uniq.sort
  tokens_b = str_b.to_s.downcase.split.uniq.sort

  return 1.0 if tokens_a == tokens_b
  return 0.0 if tokens_a.empty? && tokens_b.empty?

  intersection = tokens_a & tokens_b
  combined = (tokens_a | tokens_b).sort

  common = intersection.join(' ')
  rest_a = (tokens_a - intersection).sort.join(' ')
  rest_b = (tokens_b - intersection).sort.join(' ')

  joined_a = [common, rest_a].reject(&:empty?).join(' ')
  joined_b = [common, rest_b].reject(&:empty?).join(' ')
  joined_full = combined.join(' ')

  scores = [
    JaroWinkler.similarity(joined_a, joined_b),
    JaroWinkler.similarity(common, joined_a),
    JaroWinkler.similarity(common, joined_b),
    JaroWinkler.similarity(common, joined_full)
  ]

  scores.compact.max || 0.0
end

.token_sort_ratio(str_a, str_b) ⇒ Float

Token-sort ratio: sort tokens alphabetically, then compute Jaro-Winkler similarity

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)

    similarity between 0.0 and 1.0



184
185
186
187
188
# File 'lib/philiprehberger/fuzzy_match.rb', line 184

def self.token_sort_ratio(str_a, str_b)
  a = str_a.to_s.downcase.split.sort.join(' ')
  b = str_b.to_s.downcase.split.sort.join(' ')
  JaroWinkler.similarity(a, b)
end

.weighted_score(str_a, str_b, weights:) ⇒ Float

Weighted score combining multiple algorithms

Parameters:

  • str_a (String)
  • str_b (String)
  • weights (Hash)

    algorithm => weight pairs (must sum to 1.0)

Returns:

  • (Float)

    weighted similarity score

Raises:

  • (Error)

    if weights do not sum to 1.0



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/philiprehberger/fuzzy_match.rb', line 230

def self.weighted_score(str_a, str_b, weights:)
  sum = weights.values.reduce(0.0, :+)
  raise Error, "Weights must sum to 1.0, got #{sum}" unless (sum - 1.0).abs < 1e-9

  algorithm_map = {
    jaro_winkler: ->(a, b) { jaro_winkler(a, b) },
    dice: ->(a, b) { dice_coefficient(a, b) },
    levenshtein_ratio: ->(a, b) { ratio(a, b) },
    lcs_ratio: ->(a, b) { lcs_ratio(a, b) },
    damerau_ratio: ->(a, b) { damerau_ratio(a, b) }
  }

  weights.reduce(0.0) do |total, (algo, weight)|
    fn = algorithm_map[algo]
    raise Error, "Unknown algorithm: #{algo}" unless fn

    total + (fn.call(str_a, str_b) * weight)
  end
end