Class: SqlChatbot::Services::SchemaService

Inherits:
Object
  • Object
show all
Defined in:
lib/sql_chatbot/services/schema_service.rb

Constant Summary collapse

SOFT_DELETE_COLUMNS =

Columns that indicate soft-delete patterns

%w[deleted_at discarded_at archived_at removed_at].freeze
SENSITIVE_PATTERNS =

Word-boundary patterns for sensitive columns — must match as whole “word segments” separated by underscores or string boundaries, to avoid false positives like “pinned_at” matching “pin”.

%w[
  password passwd secret token ssn social_security
  credit_card card_number cvv pin encrypted hash
  salt private_key api_key auth_key access_key
].freeze
TYPE_MAP =

Maps PostgreSQL data_type strings to concise labels used in the schema summary

{
  "character varying"            => "VARCHAR",
  "integer"                      => "INT",
  "bigint"                       => "BIGINT",
  "smallint"                     => "SMALLINT",
  "timestamp without time zone"  => "TIMESTAMP",
  "timestamp with time zone"     => "TIMESTAMPTZ",
  "numeric"                      => "DECIMAL",
  "boolean"                      => "BOOL",
  "text"                         => "TEXT",
  "date"                         => "DATE",
  "double precision"             => "DOUBLE",
  "real"                         => "REAL",
  "uuid"                         => "UUID",
  "jsonb"                        => "JSONB",
  "json"                         => "JSON",
}.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initializeSchemaService

Returns a new instance of SchemaService.



63
64
65
66
67
68
69
# File 'lib/sql_chatbot/services/schema_service.rb', line 63

def initialize
  @summary_text = ""
  @tables = []
  @per_table_schemas = {}
  @table_index = {}
  @fk_graph = {}
end

Instance Attribute Details

#table_countObject (readonly)


Instance




61
62
63
# File 'lib/sql_chatbot/services/schema_service.rb', line 61

def table_count
  @table_count
end

Class Method Details

.map_type(pg_type) ⇒ Object

Map a PostgreSQL data_type to a concise label; unknown types are uppercased.



53
54
55
# File 'lib/sql_chatbot/services/schema_service.rb', line 53

def self.map_type(pg_type)
  TYPE_MAP[pg_type] || pg_type.upcase
end

.sensitive?(column_name) ⇒ Boolean

Returns true if the column name matches any sensitive pattern using word-boundary matching: pattern must appear between start-of-string / underscore boundaries. This avoids false positives like “pinned_at” matching “pin”.

Returns:

  • (Boolean)


45
46
47
48
49
50
# File 'lib/sql_chatbot/services/schema_service.rb', line 45

def self.sensitive?(column_name)
  lower = column_name.downcase
  SENSITIVE_PATTERNS.any? do |pattern|
    lower.match?(/(?:^|_)#{Regexp.escape(pattern)}(?:$|_)/)
  end
end

Instance Method Details

#append_model_annotations(annotations_by_table) ⇒ Object

Inject model-level annotations (from ModelIntrospector) into the schema summary. annotations_by_table: Hash of table_name => [annotation_strings] Each annotation is inserted after the TABLE line and any existing annotations.



225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'lib/sql_chatbot/services/schema_service.rb', line 225

def append_model_annotations(annotations_by_table)
  return if annotations_by_table.nil? || annotations_by_table.empty?

  lines = @summary_text.split("\n")
  result = []
  current_table = nil

  lines.each do |line|
    if line.start_with?("TABLE ")
      # Before moving to next table, flush pending annotations for previous table
      if current_table && annotations_by_table.key?(current_table)
        annotations_by_table[current_table].each { |ann| result << ann }
      end
      current_table = line.match(/^TABLE (\S+)/)[1]
    end
    result << line
  end

  # Flush annotations for the last table
  if current_table && annotations_by_table.key?(current_table)
    annotations_by_table[current_table].each { |ann| result << ann }
  end

  @summary_text = result.join("\n")

  # Also update per-table schemas so select_schema() includes the annotations
  annotations_by_table.each do |table, annotations|
    next unless @per_table_schemas.key?(table)
    annotations.each do |ann|
      @per_table_schemas[table] += "\n#{ann}"
    end
  end
end

#apply_soft_delete_annotations(soft_delete_tables:, enum_soft_delete_tables:) ⇒ Object

Apply soft delete annotations conditionally based on model introspection results.

  • Tables using a soft delete gem (paranoia, discard): always add SOFT DELETE annotation

  • Tables with enum soft delete but no gem: suppress SOFT DELETE (enum is the real mechanism)

  • Tables with neither: add SOFT DELETE annotation (assume column is used)



485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
# File 'lib/sql_chatbot/services/schema_service.rb', line 485

def apply_soft_delete_annotations(soft_delete_tables:, enum_soft_delete_tables:)
  return if @deferred_soft_deletes.nil? || @deferred_soft_deletes.empty?

  new_annotations = {}
  @deferred_soft_deletes.each do |table, columns|
    if soft_delete_tables.include?(table)
      # Gem manages this column — keep the annotation
      columns.each do |col|
        (new_annotations[table] ||= []) << "  -- SOFT DELETE: filter #{col} IS NULL for active records"
      end
    elsif enum_soft_delete_tables.include?(table)
      # Enum is the real soft delete, column is likely unused — suppress
      next
    else
      # No competing mechanism — assume column is used
      columns.each do |col|
        (new_annotations[table] ||= []) << "  -- SOFT DELETE: filter #{col} IS NULL for active records"
      end
    end
  end

  append_model_annotations(new_annotations) unless new_annotations.empty?
end

#discoverObject

Introspect the database and build a schema summary string with enrichment annotations (soft delete, polymorphic, lookup values, enums, check constraints). Requires ActiveRecord::Base.connection to be available.



262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/sql_chatbot/services/schema_service.rb', line 262

def discover
  conn = ActiveRecord::Base.connection

  # Run all introspection queries
  table_names   = query_tables(conn)
  columns_rows  = query_columns(conn)
  pk_rows       = query_primary_keys(conn)
  fk_rows       = query_foreign_keys(conn)
  enum_rows     = query_enums(conn)
  check_rows    = query_check_constraints(conn)

  # Index primary keys: Set of "table.column"
  pk_set = Set.new(pk_rows.map { |r| "#{r['table_name']}.#{r['column_name']}" })

  # Index foreign keys: Hash of "from_table.from_column" => "to_table.to_column"
  fk_map = fk_rows.each_with_object({}) do |r, h|
    h["#{r['from_table']}.#{r['from_column']}"] = "#{r['to_table']}.#{r['to_column']}"
  end

  # Index enum types: enum_name => [ordered values]
  enum_map = enum_rows.each_with_object({}) do |r, h|
    (h[r["enum_name"]] ||= []) << r["enum_value"]
  end

  # Parse check constraints for IN (...) or ANY(ARRAY[...]) patterns
  check_enum_map = {}
  check_rows.each do |r|
    result = parse_check_constraint(r["check_def"])
    next unless result

    col_name, values = result
    check_enum_map["#{r['table_name']}.#{col_name}"] = values
  end

  # Group columns by table
  columns_by_table = columns_rows.each_with_object({}) do |col, h|
    (h[col["table_name"]] ||= []) << col
  end

  # Collect FK target tables for lookup value detection
  fk_target_tables = Set.new(fk_rows.map { |r| r["to_table"] })

  # Convention-based references: *_id columns => plural table names
  table_name_set = Set.new(table_names)
  columns_by_table.each_value do |cols|
    cols.each do |col|
      col_name = col["column_name"]
      next unless col_name.end_with?("_id")
      next if pk_set.include?("#{col['table_name']}.#{col_name}")

      base = col_name[0..-4] # remove '_id'
      candidates = [
        "#{base}s",
        "#{base.sub(/y$/, 'ie')}s",
        "#{base}es",
        base,
      ]
      candidates.each do |candidate|
        if table_name_set.include?(candidate)
          fk_target_tables.add(candidate)
          break
        end
      end
    end
  end

  # Discover lookup values for small referenced tables
  lookup_values = discover_lookup_values(conn, fk_target_tables, columns_by_table, pk_set)

  # Get approximate row counts for all tables (helps LLM distinguish data vs config tables)
  row_counts = query_row_counts(conn)

  # Build summary lines
  lines = []
  table_names.each do |table|
    columns = columns_by_table[table] || []
    col_parts = []
    annotations = []
    col_name_types = {} # column_name => mapped_type (for polymorphic detection)

    columns.each do |col|
      next if self.class.sensitive?(col["column_name"])

      key = "#{table}.#{col['column_name']}"

      # Resolve enum values: PG native enum or check-constraint enum
      enum_values = if col["data_type"] == "USER-DEFINED" && col["udt_name"]
                     enum_map[col["udt_name"]]
                   end
      enum_values ||= check_enum_map[key]

      mapped_type = if enum_values
                     "ENUM(#{enum_values.join(',')})"
                   else
                     self.class.map_type(col["data_type"])
                   end

      part = "#{col['column_name']} #{mapped_type}"
      part += " PK" if pk_set.include?(key)
      part += " FK=>#{fk_map[key]}" if fk_map.key?(key)

      col_parts << part
      col_name_types[col["column_name"]] = mapped_type

      # Defer soft delete annotation (applied after model introspection)
      if SOFT_DELETE_COLUMNS.include?(col["column_name"])
        (@deferred_soft_deletes ||= {})[table] ||= []
        @deferred_soft_deletes[table] << col["column_name"]
      end

      # Enum value annotation
      if enum_values
        annotations << "  -- ENUM: #{col['column_name']} values: #{enum_values.join(', ')}"
      end
    end

    # Polymorphic association detection
    col_name_types.each do |col_name, col_type|
      next unless col_name.end_with?("_type") && %w[VARCHAR TEXT].include?(col_type)

      prefix = col_name[0..-6] # remove '_type'
      id_col = "#{prefix}_id"
      id_type = col_name_types[id_col]
      if id_type && %w[INT BIGINT].include?(id_type)
        annotations << "  -- POLYMORPHIC: #{col_name} + #{id_col} (join target depends on type value)"
      end
    end

    # Lookup values annotation
    if lookup_values.key?(table)
      annotations << "  -- VALUES: #{lookup_values[table]}"
    end

    count = row_counts[table]
    count_hint = count ? " (~#{count} rows)" : ""
    lines << "TABLE #{table}#{count_hint} (#{col_parts.join(', ')})"
    annotations.each { |ann| lines << ann }
  end

  @tables = table_names
  @summary_text = lines.join("\n")

  build_per_table_schemas(lines)
  build_table_index(columns_by_table)
  build_fk_graph(fk_rows, columns_by_table, table_name_set)
end

#extract_enum_context(schema_text = nil) ⇒ Object

Extract RAILS ENUM annotations from a schema string for the answer prompt. Returns a string like:

"contractors.status: Active=1, Inactive=2, Deleted=3\njobs.status: Active=1, ..."


150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/sql_chatbot/services/schema_service.rb', line 150

def extract_enum_context(schema_text = nil)
  source = schema_text || @summary_text
  return "" if source.empty?

  lines = []
  current_table = nil

  source.split("\n").each do |line|
    if line.start_with?("TABLE ")
      current_table = line.match(/^TABLE (\S+)/)[1]
    elsif line.include?("RAILS ENUM:") && current_table
      match = line.match(/RAILS ENUM:\s+(\S+)\s+values:\s+(.+)/)
      next unless match
      lines << "#{current_table}.#{match[1]}: #{match[2]}"
    end
  end

  lines.join("\n")
end

#find_lookup_hints(question) ⇒ Object

Scan FK LOOKUP and RAILS ENUM annotations for values that match words in the question. Returns array of hint strings like:

"The user mentions 'movies'. In the titles table, use WHERE category_id = 2 (Movie)."
"The user mentions 'active'. In the contractors table, use WHERE status = 1 (Active)."


83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/sql_chatbot/services/schema_service.rb', line 83

def find_lookup_hints(question)
  return [] if @summary_text.empty?

  # Filter out stop words that would match too broadly
  stop_words = Set.new(%w[a an the is are was were be been being have has had do does did will would shall should may might can could how what when where who which why not and or but if then else for from by with at in on to of it its this that these those])
  words = question.downcase.split(/\W+/).reject { |w| w.empty? || w.length < 2 || stop_words.include?(w) }
  hints = []
  current_table = nil

  @summary_text.split("\n").each do |line|
    if line.start_with?("TABLE ")
      current_table = line.match(/^TABLE (\S+)/)[1]
    elsif line.include?("FK LOOKUP:") && current_table
      match = line.match(/FK LOOKUP:\s+(\S+).*?values:\s+(.+)/)
      next unless match

      fk_col = match[1]
      pairs = match[2].split(",").map(&:strip)
      pairs.each do |pair|
        id, name = pair.split("=", 2)
        next unless name
        clean_name = name.strip
        next if clean_name.empty? || clean_name.length < 2  # Skip empty/tiny names

        name_words = clean_name.downcase.split(/\W+/).reject(&:empty?)
        matched_word = words.find do |w|
          name_words.include?(w) ||
            clean_name.downcase == w ||
            (clean_name.length >= 3 && clean_name.downcase.start_with?(w)) ||
            (w.length >= 3 && w.start_with?(clean_name.downcase))
        end
        if matched_word
          hints << "The user mentions \"#{matched_word}\". In the #{current_table} table, use WHERE #{fk_col} = #{id.strip} (#{clean_name})."
        end
      end
    elsif line.include?("RAILS ENUM:") && current_table
      match = line.match(/RAILS ENUM:\s+(\S+)\s+values:\s+(.+)/)
      next unless match

      col = match[1]
      pairs = match[2].split(",").map(&:strip)
      pairs.each do |pair|
        label, num = pair.split("=", 2)
        next unless label && num
        clean_label = label.strip
        next if clean_label.empty? || clean_label.length < 2

        label_words = clean_label.downcase.split(/\W+/).reject(&:empty?)
        matched_word = words.find do |w|
          label_words.include?(w) ||
            clean_label.downcase == w ||
            (clean_label.length >= 3 && clean_label.downcase.start_with?(w)) ||
            (w.length >= 3 && w.start_with?(clean_label.downcase))
        end
        if matched_word
          hints << "The user mentions \"#{matched_word}\". In the #{current_table} table, use WHERE #{col} = #{num.strip} (#{clean_label})."
        end
      end
    end
  end

  hints.uniq.first(15)  # Cap at 15 hints to avoid drowning the LLM
end

#refreshObject

Re-discover schema (alias for discover)



410
411
412
# File 'lib/sql_chatbot/services/schema_service.rb', line 410

def refresh
  discover
end

#relocate_lookup_annotationsObject

Move “– VALUES:” annotations from lookup tables to the FK columns that reference them. After this, LLMs see lookup values next to the FK column (e.g., category_id) instead of on the lookup table itself, preventing confusion between unrelated integer columns.



417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
# File 'lib/sql_chatbot/services/schema_service.rb', line 417

def relocate_lookup_annotations
  lines = @summary_text.split("\n")

  # Step 1: Extract VALUES annotations and their tables
  lookup_values = {} # table_name => values_string
  lines_without_values = []
  current_table = nil

  lines.each do |line|
    if line.start_with?("TABLE ")
      current_table = line.match(/^TABLE (\S+)/)[1]
    end

    if line.strip.start_with?("-- VALUES:")
      lookup_values[current_table] = line.strip.sub("-- VALUES: ", "") if current_table
    else
      lines_without_values << line
    end
  end

  return if lookup_values.empty?

  # Step 2: Build convention-based table name patterns for matching
  convention_map = {} # "singular_id" => lookup_table
  lookup_values.each_key do |table|
    singular = if table.end_with?("ies")
                 table[0..-4] + "y"
               elsif table.end_with?("ses")
                 table[0..-3]
               elsif table.end_with?("s")
                 table[0..-2]
               else
                 table
               end
    convention_map["#{singular}_id"] = table
  end

  # Step 3: Find FK columns and inject FK LOOKUP annotations
  result = []
  lines_without_values.each do |line|
    result << line

    if line.start_with?("TABLE ")
      # Match explicit FK references: "column_name INT FK=>target_table.target_column"
      lookup_values.each do |lookup_table, values|
        line.scan(/(\w+)\s+\w+\s+FK=>#{Regexp.escape(lookup_table)}\.(\w+)/).each do |fk_col, _target_col|
          result << "  -- FK LOOKUP: #{fk_col} values: #{values}"
        end
      end

      # Match convention-based references: "category_id INT" (no FK=> marker)
      convention_map.each do |fk_col_name, lookup_table|
        # Skip if already matched by explicit FK above
        next if line.include?("#{fk_col_name} ") && line.include?("FK=>#{lookup_table}")
        if line.match?(/\b#{Regexp.escape(fk_col_name)}\s+\w+(?!\s+FK)/)
          result << "  -- FK LOOKUP: #{fk_col_name} values: #{lookup_values[lookup_table]}"
        end
      end
    end
  end

  @summary_text = result.join("\n")
end

#select_schema(terms) ⇒ Object

Given an array of search terms (e.g. [“customers”] or [“jobs”, “job_types”]), returns a schema string containing ONLY the matching tables plus any tables needed to join them (bridge tables via FK paths, max depth 2).

Falls back to hub tables (top 10 by FK edge count) when no terms match.



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/sql_chatbot/services/schema_service.rb', line 184

def select_schema(terms)
  # If indexes are empty (discover not yet called), return full summary
  return @summary_text if @per_table_schemas.empty?

  # Step 1: Score tables by relevance
  scores = score_tables(terms)

  # Step 2: Fallback to hub tables if no matches
  if scores.empty?
    hub_tables(8).each { |t| scores[t] = 1 }
  end

  # Step 3: Take top 8 by score
  max_primary = 8
  top_tables = scores.sort_by { |_, s| -s }.first(max_primary).map(&:first)

  # Step 4: Find FK join paths between top tables
  all_tables = Set.new(top_tables)
  top_tables.combination(2).each do |from, to|
    bridge = find_join_path(from, to)
    all_tables.merge(bridge) unless bridge.nil?
  end

  # Step 5: Cap total at 12
  max_total = 12
  final_tables = if all_tables.size <= max_total
                   all_tables
                 else
                   Set.new((top_tables + all_tables.to_a.reject { |t| top_tables.include?(t) }).first(max_total))
                 end

  # Step 6: Build schema string preserving original order
  @tables.select { |t| final_tables.include?(t) }
         .map { |t| @per_table_schemas[t] }
         .compact
         .join("\n")
end

#summaryObject



71
72
73
# File 'lib/sql_chatbot/services/schema_service.rb', line 71

def summary
  @summary_text
end

#table_namesObject

Returns a short string listing all known table names. Used by the classify prompt so the LLM can see available tables without the full schema (~500 tokens vs ~275K chars for the full schema).



173
174
175
176
177
# File 'lib/sql_chatbot/services/schema_service.rb', line 173

def table_names
  return "" if @tables.empty?

  "Available tables: #{@tables.join(', ')}"
end