Module: SqlChatbot::Grammar::EntityCandidates

Defined in:
lib/sql_chatbot/grammar/entity_candidates.rb

Class Method Summary collapse

Class Method Details

.levenshtein(a, b) ⇒ Object

Damerau-Levenshtein edit distance: insertions, deletions, substitutions, and adjacent transpositions. Transposition counted as 1 (vs 2 in plain Levenshtein) because keyboard typos like “lables” ↔ “labels” are extremely common and should match at distance 1.



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/sql_chatbot/grammar/entity_candidates.rb', line 102

def self.levenshtein(a, b)
  # Damerau-Levenshtein (optimal string alignment variant): unit cost for
  # insert, delete, substitute, and adjacent transposition. Full (m+1)x(n+1)
  # table; dist[i][j] is the distance between a[0, i] and b[0, j].
  return 0 if a == b
  return b.length if a.empty?
  return a.length if b.empty?

  m = a.length
  n = b.length
  dist = Array.new(m + 1) { Array.new(n + 1, 0) }
  (0..m).each { |i| dist[i][0] = i }
  (0..n).each { |j| dist[0][j] = j }

  1.upto(m) do |i|
    1.upto(n) do |j|
      substitution = a[i - 1] == b[j - 1] ? 0 : 1
      best = [
        dist[i - 1][j] + 1,                  # deletion
        dist[i][j - 1] + 1,                  # insertion
        dist[i - 1][j - 1] + substitution,   # substitution / match
      ].min
      # Adjacent transposition ("lables" <-> "labels") counts as one edit.
      if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1]
        transposed = dist[i - 2][j - 2] + 1
        best = transposed if transposed < best
      end
      dist[i][j] = best
    end
  end
  dist[m][n]
end

.name_segments(entity) ⇒ Object



131
132
133
# File 'lib/sql_chatbot/grammar/entity_candidates.rb', line 131

def self.name_segments(entity)
  # Number of underscore-separated segments in the entity name
  # (e.g. "projects_project" -> 2). Used as a sort tie-breaker.
  segments = entity.name.to_s.split("_")
  segments.size
end

.pluralize_simple(word) ⇒ Object



125
126
127
128
129
# File 'lib/sql_chatbot/grammar/entity_candidates.rb', line 125

def self.pluralize_simple(word)
  # Minimal English pluralizer — covers the common table-name cases only
  # ("box" -> "boxes", "city" -> "cities", "user" -> "users").
  if word.end_with?("s", "x", "ch", "sh")
    "#{word}es"
  elsif word.end_with?("y") && !word[-2]&.match?(/[aeiou]/)
    # Consonant + "y" -> "ies"; safe-navigation keeps the bare "y" edge case.
    "#{word[0..-2]}ies"
  else
    "#{word}s"
  end
end

.score_entity(question, entity, registry) ⇒ Object

Score an entity against the question. Tokenizes the name on `_` so e.g. the `projects_project` token “project” matches the question “how many projects”. Tie-breakers: fewer name segments, then higher row count.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/sql_chatbot/grammar/entity_candidates.rb', line 12

def self.score_entity(question, entity, registry)
  # Score `entity` against `question`. Returns `[score, fuzzy]` where
  # `fuzzy` is nil or `{ typed:, corrected: }` when the only signal was a
  # typo-tolerant match. Higher score = stronger binding.
  text = question.to_s.downcase
  name = entity.name.to_s.downcase
  table = entity.table.to_s.downcase
  total = 0

  # Standalone singular name is the strongest direct signal.
  standalone =
    text.include?(" #{name} ") ||
    text.start_with?("#{name} ") ||
    text.end_with?(" #{name}")
  total += 12 if standalone
  total += 10 if text.include?(table)
  total += 5 if text.include?(name)

  # Registered aliases that point at this entity.
  registry.aliases.each do |term, target|
    total += 8 if target == entity.name && text.include?(term.to_s.downcase)
  end

  # Token-level matching for compound names like `projects_project`.
  tokens = name.split("_").select { |t| t.length >= 3 }
  tokens.each do |tok|
    pattern = /\b(#{Regexp.escape(tok)}|#{Regexp.escape(pluralize_simple(tok))})\b/
    total += 4 if pattern.match?(text)
  end

  # Whitespace-collapsed match — length-weighted so longer matches win.
  squeezed = text.gsub(/\s+/, "")
  longest = 0
  tokens.each do |tok|
    next if tok.length < 5
    [tok, pluralize_simple(tok)].each do |candidate|
      longest = candidate.length if candidate.length > longest && squeezed.include?(candidate)
    end
  end
  total += longest

  # Fuzzy fallback for typos, only when nothing above fired. Two tiers:
  # aliases (score 5 — strongest, same tier as exact-alias just approximate),
  # then name/plural/tokens (score 3 — weaker, tokens recur across entities
  # in compound-named schemas). Minimum length 4 catches "usrs" -> "users"
  # while rejecting 3-char noise; distance <= 25% of length filters further.
  min_len = 4
  if total.zero?
    alias_targets = []
    registry.aliases.each do |term, target|
      next unless target == entity.name
      alias_targets << term.to_s.downcase if term.to_s.length >= min_len
    end

    name_targets = []
    name_targets << name  if name.length >= min_len
    name_targets << table if table.length >= min_len
    tokens.each do |tok|
      next if tok.length < min_len
      name_targets << tok
      plural_tok = pluralize_simple(tok)
      name_targets << plural_tok if plural_tok.length >= min_len
    end

    words = text.split(/\W+/).select { |w| w.length >= min_len }
    [[alias_targets, 5], [name_targets, 3]].each do |targets, points|
      words.each do |word|
        targets.each do |target|
          dist = levenshtein(word, target)
          next if dist.zero?
          longest_side = [word.length, target.length].max
          if dist <= 2 && dist.to_f / longest_side <= 0.25
            return [points, { typed: word, corrected: target }]
          end
        end
      end
    end
  end

  [total, nil]
end

.select(question:, registry:, top_n:) ⇒ Object



135
136
137
# File 'lib/sql_chatbot/grammar/entity_candidates.rb', line 135

def self.select(question:, registry:, top_n:)
  # Convenience wrapper: same candidates as select_with_meta, entities only.
  rows = select_with_meta(question: question, registry: registry, top_n: top_n)
  rows.map { |row| row[:entity] }
end

.select_with_meta(question:, registry:, top_n:) ⇒ Object

Returns rows of `{ entity:, score:, fuzzy_match: nil | { typed:, corrected: } }`. Used by the intent-extractor prompt to tell the LLM “the user word `<typo>` is likely a typo of `<entity>`” so it commits to the candidate instead of returning unmatched on a stray typo.



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/sql_chatbot/grammar/entity_candidates.rb', line 143

def self.select_with_meta(question:, registry:, top_n:)
  # Rank all registry entities against the question and return up to `top_n`
  # rows of `{ entity:, score:, fuzzy_match: }`. The fuzzy annotation lets
  # the intent-extractor prompt flag "<typo> is likely <entity>" so the LLM
  # commits instead of bailing on a stray typo.
  scored = registry.entities.values.map do |entity|
    points, fuzzy = score_entity(question, entity, registry)
    {
      entity: entity,
      score: points,
      fuzzy_match: fuzzy,
      segments: name_segments(entity),
      row_count: entity.row_count,
    }
  end

  # Best score first; ties broken by fewer name segments, then bigger table.
  scored = scored.sort_by { |row| [-row[:score], row[:segments], -row[:row_count]] }

  # Nothing matched at all: fall back to the largest tables by row count.
  if scored.first && scored.first[:score].zero?
    fallback = registry.entities.values.sort_by { |e| -e.row_count }.first(top_n)
    return fallback.map { |entity| { entity: entity, score: 0, fuzzy_match: nil } }
  end

  # When a typed word resolved strongly via an alias (score 5), drop OTHER
  # candidates that only scored via a weaker token-fuzzy match (score 3) on
  # that same word — they are unrelated tables whose presence tempts the LLM
  # to override the resolution.
  strong_typed = {}
  scored.each do |row|
    fuzzy = row[:fuzzy_match]
    strong_typed[fuzzy[:typed]] = true if fuzzy && row[:score] == 5
  end
  unless strong_typed.empty?
    scored = scored.reject do |row|
      fuzzy = row[:fuzzy_match]
      fuzzy && row[:score] == 3 && strong_typed[fuzzy[:typed]]
    end
  end

  # V1.3-R: alternate-suppression when a primary clearly dominates (mirror
  # of TS selectEntityCandidatesWithMeta). With a strong top match
  # (score >= 8), drop alternates scoring at most half of it so the LLM
  # stays focused on the right binding.
  dominance_floor = 8
  keep_ratio = 0.5
  leader = scored.first
  if leader && leader[:score] >= dominance_floor
    cutoff = leader[:score] * keep_ratio
    scored = [leader] + scored.drop(1).reject { |row| row[:score] <= cutoff }
  end

  # Each typed word keeps its fuzzy annotation on the first (best) row only.
  seen_typed = {}
  scored.first(top_n).map do |row|
    fuzzy = row[:fuzzy_match]
    if fuzzy && seen_typed[fuzzy[:typed]]
      fuzzy = nil
    elsif fuzzy
      seen_typed[fuzzy[:typed]] = true
    end
    { entity: row[:entity], score: row[:score], fuzzy_match: fuzzy }
  end
end