Module: Philiprehberger::FuzzyMatch

Defined in:
lib/philiprehberger/fuzzy_match.rb,
lib/philiprehberger/fuzzy_match/lcs.rb,
lib/philiprehberger/fuzzy_match/dice.rb,
lib/philiprehberger/fuzzy_match/hamming.rb,
lib/philiprehberger/fuzzy_match/soundex.rb,
lib/philiprehberger/fuzzy_match/version.rb,
lib/philiprehberger/fuzzy_match/metaphone.rb,
lib/philiprehberger/fuzzy_match/levenshtein.rb,
lib/philiprehberger/fuzzy_match/jaro_winkler.rb,
lib/philiprehberger/fuzzy_match/damerau_levenshtein.rb

Defined Under Namespace

Modules: DamerauLevenshtein, Dice, Hamming, JaroWinkler, Lcs, Levenshtein, Metaphone, Soundex Classes: Error

Constant Summary collapse

VERSION =
'0.6.0'

Class Method Summary collapse

Class Method Details

.best(query, candidates, threshold: 0.0) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/philiprehberger/fuzzy_match.rb', line 82

def self.best(query, candidates, threshold: 0.0)
  return nil if candidates.empty?

  best_result = nil
  best_score = -1.0

  candidates.each do |candidate|
    score = ratio(query, candidate.to_s)
    if score > best_score
      best_result = candidate
      best_score = score
    end
  end

  return nil if best_score < threshold

  { match: best_result, score: best_score.round(4) }
end

.damerau_levenshtein(str_a, str_b) ⇒ Integer

Damerau-Levenshtein edit distance (counts transpositions as 1 edit)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)


53
54
55
# File 'lib/philiprehberger/fuzzy_match.rb', line 53

def self.damerau_levenshtein(str_a, str_b)
  DamerauLevenshtein.distance(str_a, str_b)
end

.damerau_ratio(str_a, str_b) ⇒ Float

Normalized Damerau-Levenshtein similarity (0.0 to 1.0)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)


62
63
64
65
66
67
68
69
70
# File 'lib/philiprehberger/fuzzy_match.rb', line 62

def self.damerau_ratio(str_a, str_b)
  a = str_a.to_s.downcase
  b = str_b.to_s.downcase
  max_len = [a.length, b.length].max
  return 1.0 if max_len.zero?

  distance = DamerauLevenshtein.distance(a, b)
  1.0 - (distance.to_f / max_len)
end

.deduplicate(array, threshold: 0.8, algorithm: :jaro_winkler) ⇒ Array<String>

Group and deduplicate similar strings

Parameters:

  • array (Array<String>)

    strings to deduplicate

  • threshold (Float) (defaults to: 0.8)

    similarity threshold (0.0 to 1.0)

  • algorithm (Symbol) (defaults to: :jaro_winkler)

    :jaro_winkler (default), :dice, or :levenshtein

Returns:

  • (Array<String>)

    unique representatives



244
245
246
247
248
249
250
251
252
253
# File 'lib/philiprehberger/fuzzy_match.rb', line 244

def self.deduplicate(array, threshold: 0.8, algorithm: :jaro_winkler)
  representatives = []
  array.each do |item|
    duplicate = representatives.any? do |rep|
      score_for(rep, item, algorithm: algorithm) >= threshold
    end
    representatives << item unless duplicate
  end
  representatives
end

.dice_coefficient(str_a, str_b) ⇒ Object



26
27
28
# File 'lib/philiprehberger/fuzzy_match.rb', line 26

def self.dice_coefficient(str_a, str_b)
  Dice.coefficient(str_a, str_b)
end

.hamming(str_a, str_b) ⇒ Integer

Hamming distance for equal-length strings

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)

Raises:

  • (Error)

    if strings have different lengths



163
164
165
# File 'lib/philiprehberger/fuzzy_match.rb', line 163

def self.hamming(str_a, str_b)
  Hamming.distance(str_a, str_b)
end

.jaro_winkler(str_a, str_b) ⇒ Object



22
23
24
# File 'lib/philiprehberger/fuzzy_match.rb', line 22

def self.jaro_winkler(str_a, str_b)
  JaroWinkler.similarity(str_a, str_b)
end

.lcs(str_a, str_b) ⇒ Integer

Length of the longest common subsequence

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Integer)


35
36
37
# File 'lib/philiprehberger/fuzzy_match.rb', line 35

def self.lcs(str_a, str_b)
  Lcs.length(str_a, str_b)
end

.lcs_ratio(str_a, str_b) ⇒ Float

Normalized LCS similarity (0.0 to 1.0)

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)


44
45
46
# File 'lib/philiprehberger/fuzzy_match.rb', line 44

def self.lcs_ratio(str_a, str_b)
  Lcs.ratio(str_a, str_b)
end

.levenshtein(str_a, str_b) ⇒ Object



18
19
20
# File 'lib/philiprehberger/fuzzy_match.rb', line 18

def self.levenshtein(str_a, str_b)
  Levenshtein.distance(str_a, str_b)
end

.metaphone(string) ⇒ String

Generate a Metaphone code for a string

Parameters:

  • string (String)

Returns:

  • (String)

    Metaphone phonetic code



142
143
144
# File 'lib/philiprehberger/fuzzy_match.rb', line 142

def self.metaphone(string)
  Metaphone.code(string)
end

.phonetic_match?(a, b) ⇒ Boolean

Check if two strings match phonetically (same Soundex code)

Parameters:

  • a (String)
  • b (String)

Returns:

  • (Boolean)


151
152
153
154
155
# File 'lib/philiprehberger/fuzzy_match.rb', line 151

def self.phonetic_match?(a, b)
  sa = Soundex.code(a)
  sb = Soundex.code(b)
  !sa.empty? && sa == sb
end

.rank(query, candidates, algorithm: :jaro_winkler) ⇒ Array<Hash>

Rank all candidates by similarity to the query (descending)

Parameters:

  • query (String)
  • candidates (Array<String>)
  • algorithm (Symbol) (defaults to: :jaro_winkler)

    :jaro_winkler (default), :dice, or :levenshtein

Returns:

  • (Array<Hash>)

    all candidates as ‘{ value:, score: }` sorted by score desc, ties broken by original input order (stable)



123
124
125
126
127
128
# File 'lib/philiprehberger/fuzzy_match.rb', line 123

def self.rank(query, candidates, algorithm: :jaro_winkler)
  scored = candidates.each_with_index.map do |candidate, index|
    [{ value: candidate, score: score_for(query, candidate.to_s, algorithm: algorithm) }, index]
  end
  scored.sort_by { |pair, index| [-pair[:score], index] }.map(&:first)
end

.ratio(str_a, str_b) ⇒ Object



72
73
74
75
76
77
78
79
80
# File 'lib/philiprehberger/fuzzy_match.rb', line 72

def self.ratio(str_a, str_b)
  a = str_a.to_s.downcase
  b = str_b.to_s.downcase
  max_len = [a.length, b.length].max
  return 1.0 if max_len.zero?

  distance = Levenshtein.distance(a, b)
  1.0 - (distance.to_f / max_len)
end

.search(query, candidates, threshold: 0.3) ⇒ Object



101
102
103
104
105
106
107
108
109
# File 'lib/philiprehberger/fuzzy_match.rb', line 101

def self.search(query, candidates, threshold: 0.3)
  scored = candidates.map do |candidate|
    score = ratio(query, candidate.to_s)
    { match: candidate, score: score.round(4) }
  end

  results = scored.select { |r| r[:score] >= threshold }
  results.sort_by { |r| -r[:score] }
end

.soundex(string) ⇒ String

Generate a Soundex code for a string

Parameters:

  • string (String)

Returns:

  • (String)

    4-character Soundex code



134
135
136
# File 'lib/philiprehberger/fuzzy_match.rb', line 134

def self.soundex(string)
  Soundex.code(string)
end

.suggest(query, candidates, threshold: 0.6, max: 5) ⇒ Object



111
112
113
114
# File 'lib/philiprehberger/fuzzy_match.rb', line 111

def self.suggest(query, candidates, threshold: 0.6, max: 5)
  results = search(query, candidates, threshold: threshold)
  results.first(max).map { |r| r[:match] }
end

.token_set_ratio(str_a, str_b) ⇒ Float

Token-set ratio: compare token sets using Jaccard-like string similarity

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)

    similarity between 0.0 and 1.0



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/philiprehberger/fuzzy_match.rb', line 183

def self.token_set_ratio(str_a, str_b)
  tokens_a = str_a.to_s.downcase.split.uniq.sort
  tokens_b = str_b.to_s.downcase.split.uniq.sort

  return 1.0 if tokens_a == tokens_b
  return 0.0 if tokens_a.empty? && tokens_b.empty?

  intersection = tokens_a & tokens_b
  combined = (tokens_a | tokens_b).sort

  common = intersection.join(' ')
  rest_a = (tokens_a - intersection).sort.join(' ')
  rest_b = (tokens_b - intersection).sort.join(' ')

  joined_a = [common, rest_a].reject(&:empty?).join(' ')
  joined_b = [common, rest_b].reject(&:empty?).join(' ')
  joined_full = combined.join(' ')

  scores = [
    JaroWinkler.similarity(joined_a, joined_b),
    JaroWinkler.similarity(common, joined_a),
    JaroWinkler.similarity(common, joined_b),
    JaroWinkler.similarity(common, joined_full)
  ]

  scores.compact.max || 0.0
end

.token_sort_ratio(str_a, str_b) ⇒ Float

Token-sort ratio: sort tokens alphabetically, then compute Jaro-Winkler similarity

Parameters:

  • str_a (String)
  • str_b (String)

Returns:

  • (Float)

    similarity between 0.0 and 1.0



172
173
174
175
176
# File 'lib/philiprehberger/fuzzy_match.rb', line 172

def self.token_sort_ratio(str_a, str_b)
  a = str_a.to_s.downcase.split.sort.join(' ')
  b = str_b.to_s.downcase.split.sort.join(' ')
  JaroWinkler.similarity(a, b)
end

.weighted_score(str_a, str_b, weights:) ⇒ Float

Weighted score combining multiple algorithms

Parameters:

  • str_a (String)
  • str_b (String)
  • weights (Hash)

    algorithm => weight pairs (must sum to 1.0)

Returns:

  • (Float)

    weighted similarity score

Raises:

  • (Error)

    if weights do not sum to 1.0



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/philiprehberger/fuzzy_match.rb', line 218

def self.weighted_score(str_a, str_b, weights:)
  sum = weights.values.reduce(0.0, :+)
  raise Error, "Weights must sum to 1.0, got #{sum}" unless (sum - 1.0).abs < 1e-9

  algorithm_map = {
    jaro_winkler: ->(a, b) { jaro_winkler(a, b) },
    dice: ->(a, b) { dice_coefficient(a, b) },
    levenshtein_ratio: ->(a, b) { ratio(a, b) },
    lcs_ratio: ->(a, b) { lcs_ratio(a, b) },
    damerau_ratio: ->(a, b) { damerau_ratio(a, b) }
  }

  weights.reduce(0.0) do |total, (algo, weight)|
    fn = algorithm_map[algo]
    raise Error, "Unknown algorithm: #{algo}" unless fn

    total + (fn.call(str_a, str_b) * weight)
  end
end