Class: Ragnar::Database

Inherits:
Object
  • Object
show all
Defined in:
lib/ragnar/database.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(db_path, table_name: "documents") ⇒ Database

Returns a new instance of Database.



5
6
7
8
9
10
# File 'lib/ragnar/database.rb', line 5

# Sets up a database handle for the dataset at +db_path+.
#
# @param db_path [String] path handed to Lancelot when the dataset is opened
# @param table_name [String] name exposed through #table_name
def initialize(db_path, table_name: "documents")
  # No dataset is held yet; the cache exists to prevent file descriptor leaks.
  @dataset_cache = nil
  @table_name = table_name
  @db_path = db_path
  ensure_database_exists
end

Instance Attribute Details

#db_path ⇒ Object (readonly)

Returns the value of attribute db_path.



3
4
5
# File 'lib/ragnar/database.rb', line 3

# Reader for the dataset path that was given to the constructor.
def db_path
  @db_path
end

#table_name ⇒ Object (readonly)

Returns the value of attribute table_name.



3
4
5
# File 'lib/ragnar/database.rb', line 3

# Reader for the table name given to the constructor ("documents" by default).
def table_name
  @table_name
end

Instance Method Details

#add_documents(documents) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/ragnar/database.rb', line 12

# Appends a batch of document chunks to the Lance dataset at @db_path,
# creating the dataset first when it does not exist yet.
#
# @param documents [Array<Hash>] one hash per chunk with the keys :id,
#   :chunk_text, :file_path, :chunk_index, :embedding and :metadata
# @return [Object, nil] nil for an empty batch; otherwise whatever
#   clear_dataset_cache returns (not meaningful to callers)
def add_documents(documents)
  return if documents.empty?

  # Convert documents to Lance-compatible format
  rows = documents.map do |doc|
    {
      id: doc[:id],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      # Missing metadata is stored as "{}" rather than "null", so readers that
      # JSON.parse this column always get a Hash back.
      metadata: (doc[:metadata] || {}).to_json
    }
  end

  # The vector dimension is taken from the first document of the batch.
  embedding_size = documents.first[:embedding].size
  schema = {
    id: :string,
    chunk_text: :string,
    file_path: :string,
    chunk_index: :int64,
    embedding: { type: "vector", dimension: embedding_size },
    metadata: :string
  }

  # Clear cache before modifying dataset
  clear_dataset_cache

  # open_or_create handles both creating a new dataset and opening an existing one.
  dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
  dataset.add_documents(rows)

  # Clear cache after modification to ensure fresh data on next read
  clear_dataset_cache
end

#count ⇒ Object



165
166
167
168
169
170
171
172
# File 'lib/ragnar/database.rb', line 165

# Number of rows currently in the dataset.
#
# Yields 0 when the dataset does not exist or no dataset handle is available.
# The rows are materialized with to_a in order to be counted.
#
# @return [Integer]
def count
  return 0 unless dataset_exists?

  store = cached_dataset
  store ? store.to_a.size : 0
end

#dataset_exists? ⇒ Boolean

Returns:

  • (Boolean)


309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# File 'lib/ragnar/database.rb', line 309

# Reports whether a usable dataset is present at @db_path.
#
# @return [Boolean] true when the path exists and a dataset is either already
#   cached or can be opened; false when the path is missing or opening fails
def dataset_exists?
  return false unless File.exist?(@db_path)

  # A populated cache is taken as proof that the dataset exists.
  return true if @dataset_cache

  begin
    # Probe only: the opened handle is deliberately not stored in the cache.
    Lancelot::Dataset.open(@db_path)
    true
  rescue StandardError
    # Any failure to open is treated as "no dataset here".
    false
  end
end

#document_count ⇒ Object

Get the total number of documents in the database



279
280
281
# File 'lib/ragnar/database.rb', line 279

# Total number of documents in the database; delegates straight to #count.
def document_count
  count
end

#full_text_search(query, limit: 10) ⇒ Object



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/ragnar/database.rb', line 254

# Runs a full-text query against the :chunk_text column.
#
# @param query [String] text handed to the dataset's full-text search
# @param limit [Integer] maximum number of hits requested
# @return [Array<Hash>] hits with :id, :chunk_text, :file_path, :chunk_index
#   and :metadata (parsed from JSON); [] when no dataset is available
def full_text_search(query, limit: 10)
  return [] unless dataset_exists?

  store = cached_dataset
  return [] unless store

  hits = store.full_text_search(query, columns: [:chunk_text], limit: limit)

  hits.map do |hit|
    # Copy the plain columns as-is, then attach the decoded metadata last.
    plain = %i[id chunk_text file_path chunk_index].each_with_object({}) do |key, out|
      out[key] = hit[key]
    end
    # A nil metadata column decodes to an empty Hash.
    plain.merge(metadata: JSON.parse(hit[:metadata] || "{}"))
  end
end

#get_all_documents_with_embeddings(limit: nil) ⇒ Object



243
244
245
246
247
248
249
250
251
252
# File 'lib/ragnar/database.rb', line 243

# Fetches documents that carry a non-empty :embedding.
#
# The limit is applied before filtering, so fewer than +limit+ documents may
# come back when some of the first rows have no embedding.
#
# @param limit [Integer, nil] how many rows to read; nil reads everything
# @return [Array] rows with an embedding; [] when no dataset is available
def get_all_documents_with_embeddings(limit: nil)
  return [] unless dataset_exists?

  store = cached_dataset
  return [] unless store

  candidates = limit ? store.first(limit) : store.to_a
  candidates.reject { |doc| !doc[:embedding] || doc[:embedding].empty? }
end

#get_documents_by_ids(ids) ⇒ Array<Hash>

Get documents by their IDs

Parameters:

  • ids (Array<String>)

    Document IDs to fetch

Returns:

  • (Array<Hash>)

    Documents with their embeddings



286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/ragnar/database.rb', line 286

# Get documents by their IDs.
#
# Every row of the dataset is read and kept when its :id is among +ids+;
# results follow dataset order, not the order of +ids+.
#
# @param ids [Array<String>] Document IDs to fetch
# @return [Array<Hash>] Documents with their embeddings and parsed metadata;
#   [] when +ids+ is empty or no dataset is available
def get_documents_by_ids(ids)
  return [] if ids.empty?
  return [] unless dataset_exists?

  store = cached_dataset
  return [] unless store

  # Set membership keeps the per-row check cheap.
  wanted = ids.to_set
  matching = store.to_a.select { |doc| wanted.include?(doc[:id]) }

  matching.map do |doc|
    copied = %i[id chunk_text file_path chunk_index embedding reduced_embedding].each_with_object({}) do |key, out|
      out[key] = doc[key]
    end
    # A nil metadata column decodes to an empty Hash.
    copied.merge(metadata: JSON.parse(doc[:metadata] || "{}"))
  end
end

#get_embeddings(limit: nil, offset: 0) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/ragnar/database.rb', line 50

# Reads the embedding columns for a window of rows.
#
# @param limit [Integer, nil] maximum number of rows; nil means no cap
# @param offset [Integer] number of leading rows to skip
# @return [Array<Hash>] hashes with :id, :embedding and :reduced_embedding;
#   [] when no dataset is available
def get_embeddings(limit: nil, offset: 0)
  return [] unless dataset_exists?

  store = cached_dataset
  return [] unless store

  window =
    if limit
      # With a positive offset, read limit + offset rows and discard the first offset.
      offset > 0 ? store.first(limit + offset).drop(offset) : store.first(limit)
    else
      store.to_a.drop(offset)
    end

  window.map do |doc|
    { id: doc[:id], embedding: doc[:embedding], reduced_embedding: doc[:reduced_embedding] }
  end
end

#get_stats ⇒ Object



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/ragnar/database.rb', line 174

# Summarizes the contents of the dataset.
#
# When no dataset is available the result holds only the zeroed counters
# (:document_count, :total_documents, :unique_files, :total_chunks,
# :with_embeddings, :with_reduced_embeddings, :total_size_mb). Otherwise it
# additionally carries :avg_chunk_size (characters), :embedding_dims and
# :reduced_dims.
#
# @return [Hash{Symbol => Object}]
def get_stats
  # cached_dataset is only consulted once the dataset is known to exist.
  dataset = cached_dataset if dataset_exists?

  unless dataset
    return {
      document_count: 0,
      total_documents: 0,
      unique_files: 0,
      total_chunks: 0,
      with_embeddings: 0,
      with_reduced_embeddings: 0,
      total_size_mb: 0.0
    }
  end

  # Every row is loaded; all statistics are computed in memory.
  all_docs = dataset.to_a

  stats = {
    document_count: all_docs.size,  # Add for compatibility with specs
    total_documents: all_docs.size,
    total_chunks: all_docs.size,
    unique_files: all_docs.map { |d| d[:file_path] }.uniq.size,
    with_embeddings: 0,
    with_reduced_embeddings: 0,
    avg_chunk_size: 0,
    total_size_mb: 0,  # Add for CLI stats command
    embedding_dims: nil,
    reduced_dims: nil
  }

  char_counts = []
  total_bytes = 0

  all_docs.each do |doc|
    if doc[:embedding] && !doc[:embedding].empty?
      stats[:with_embeddings] += 1
      # Dimensions are taken from the first non-empty vector encountered.
      stats[:embedding_dims] ||= doc[:embedding].size
    end

    if doc[:reduced_embedding] && !doc[:reduced_embedding].empty?
      stats[:with_reduced_embeddings] += 1
      stats[:reduced_dims] ||= doc[:reduced_embedding].size
    end

    text = doc[:chunk_text]
    next unless text

    char_counts << text.size
    # bytesize, not size: multibyte characters occupy more than one byte, and
    # this total feeds the megabyte figure.
    total_bytes += text.bytesize
  end

  stats[:avg_chunk_size] = (char_counts.sum.to_f / char_counts.size).round unless char_counts.empty?
  stats[:total_size_mb] = (total_bytes / 1024.0 / 1024.0).round(2)

  stats
end

#search_similar(embedding, k: 10, use_reduced: false) ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/ragnar/database.rb', line 138

# Finds the rows nearest to +embedding+ via the dataset's vector search.
#
# @param embedding [#to_a] query vector
# @param k [Integer] maximum number of neighbours requested
# @param use_reduced [Boolean] search the :reduced_embedding column instead
#   of :embedding
# @return [Array<Hash>] hits with :id, :chunk_text, :file_path, :chunk_index,
#   :distance (the row's :_distance) and parsed :metadata; [] when no dataset
#   is available
def search_similar(embedding, k: 10, use_reduced: false)
  return [] unless dataset_exists?

  store = cached_dataset
  return [] unless store

  vector_column = :embedding
  vector_column = :reduced_embedding if use_reduced

  neighbours = store.vector_search(embedding.to_a, column: vector_column, limit: k)

  neighbours.map do |hit|
    plain = %i[id chunk_text file_path chunk_index].each_with_object({}) do |key, out|
      out[key] = hit[key]
    end
    # A nil metadata column decodes to an empty Hash.
    plain.merge(distance: hit[:_distance], metadata: JSON.parse(hit[:metadata] || "{}"))
  end
end

#update_reduced_embeddings(updates) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/ragnar/database.rb', line 75

# Stores reduced embeddings for existing documents.
#
# The whole dataset is read into memory, the matching rows get their
# :reduced_embedding replaced, the dataset directory is deleted, and a new
# dataset (with a :reduced_embedding vector column) is written in its place.
#
# @param updates [Array<Hash>] hashes with :id and :reduced_embedding
# @return [Object, nil] nil when there is nothing to do (no updates, no
#   dataset handle, or an empty dataset); otherwise whatever
#   clear_dataset_cache returns
def update_reduced_embeddings(updates)
  return if updates.empty?

  dataset = cached_dataset
  return unless dataset

  # Get all existing documents and safely extract their data
  all_docs = dataset.to_a.map do |doc|
    {
      id: doc[:id],
      # NOTE(review): :content is carried over but is not part of the schema
      # built below — confirm how Lancelot treats the extra key.
      content: doc[:content],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      metadata: doc[:metadata],
      reduced_embedding: doc[:reduced_embedding]
    }
  end

  # With no rows there is nothing to update and no embedding to size the
  # schema from; stop before anything destructive happens.
  return if all_docs.empty?

  # Create a map for quick lookup
  update_map = updates.each_with_object({}) do |update, map|
    map[update[:id]] = update[:reduced_embedding]
  end

  # Rows without a matching update are written back unchanged.
  updated_docs = all_docs.map do |doc|
    reduced = update_map[doc[:id]]
    reduced ? doc.merge(reduced_embedding: reduced) : doc
  end

  # Vector dimensions come from the first stored row and the first update.
  embedding_size = all_docs.first[:embedding].size
  reduced_size = updates.first[:reduced_embedding].size

  schema = {
    id: :string,
    chunk_text: :string,
    file_path: :string,
    chunk_index: :int64,
    embedding: { type: "vector", dimension: embedding_size },
    reduced_embedding: { type: "vector", dimension: reduced_size },
    metadata: :string
  }

  # Clear cache before recreating dataset
  clear_dataset_cache

  # Remove old dataset and create new one with updated data
  FileUtils.rm_rf(@db_path)
  # Use open_or_create which will create since we just deleted the path
  dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
  dataset.add_documents(updated_docs)

  # Clear cache after modification
  clear_dataset_cache
end