Class: Ragnar::Database
- Inherits:
-
Object
- Object
- Ragnar::Database
- Defined in:
- lib/ragnar/database.rb
Instance Attribute Summary collapse
-
#db_path ⇒ Object
readonly
Returns the value of attribute db_path.
-
#table_name ⇒ Object
readonly
Returns the value of attribute table_name.
Instance Method Summary collapse
- #add_documents(documents) ⇒ Object
- #count ⇒ Object
- #dataset_exists? ⇒ Boolean
-
#document_count ⇒ Object
Get the total number of documents in the database.
- #full_text_search(query, limit: 10) ⇒ Object
- #get_all_documents_with_embeddings(limit: nil) ⇒ Object
-
#get_documents_by_ids(ids) ⇒ Array<Hash>
Get documents by their IDs.
- #get_embeddings(limit: nil, offset: 0) ⇒ Object
- #get_stats ⇒ Object
-
#initialize(db_path, table_name: "documents") ⇒ Database
constructor
A new instance of Database.
- #search_similar(embedding, k: 10, use_reduced: false) ⇒ Object
- #update_reduced_embeddings(updates) ⇒ Object
Constructor Details
#initialize(db_path, table_name: "documents") ⇒ Database
Returns a new instance of Database.
5 6 7 8 9 10 |
# File 'lib/ragnar/database.rb', line 5
# Sets up a database handle and makes sure the backing store is present.
#
# @param db_path [Object] location of the dataset (stored as-is in @db_path)
# @param table_name [String] table name to remember; defaults to "documents"
def initialize(db_path, table_name: "documents")
  @db_path, @table_name = db_path, table_name
  # Starts empty; the cache exists to prevent file descriptor leaks.
  @dataset_cache = nil
  ensure_database_exists
end
Instance Attribute Details
#db_path ⇒ Object (readonly)
Returns the value of attribute db_path.
3 4 5 |
# File 'lib/ragnar/database.rb', line 3
# Read-only accessor for the dataset location.
#
# @return [Object] the current value of @db_path
def db_path
  @db_path
end
#table_name ⇒ Object (readonly)
Returns the value of attribute table_name.
3 4 5 |
# File 'lib/ragnar/database.rb', line 3
# Read-only accessor for the table name.
#
# @return [Object] the current value of @table_name
def table_name
  @table_name
end
Instance Method Details
#add_documents(documents) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/ragnar/database.rb', line 12
# Appends documents to the dataset at @db_path, creating the dataset when it
# does not exist yet.
#
# @param documents [Array<Hash>] each hash is read for :id, :chunk_text,
#   :file_path, :chunk_index, :embedding and :metadata
# @return [Object, nil] nil when +documents+ is empty; otherwise the result
#   of the final clear_dataset_cache call
def add_documents(documents)
  return if documents.empty?

  # Convert documents to Lance-compatible format; metadata is serialised to
  # a JSON string because the schema stores it in a :string column.
  data = documents.map do |doc|
    {
      id: doc[:id],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      metadata: doc[:metadata].to_json
    }
  end

  # The vector column width is taken from the first document's embedding.
  embedding_size = documents.first[:embedding].size
  schema = {
    id: :string,
    chunk_text: :string,
    file_path: :string,
    chunk_index: :int64,
    embedding: { type: "vector", dimension: embedding_size },
    metadata: :string
  }

  # Clear cache before modifying dataset
  clear_dataset_cache

  # open_or_create handles both creating a new dataset and opening an
  # existing one.
  dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
  dataset.add_documents(data)

  # Clear cache after modification to ensure fresh data on next read
  clear_dataset_cache
end
#count ⇒ Object
165 166 167 168 169 170 171 172 |
# File 'lib/ragnar/database.rb', line 165
# Counts the rows currently stored in the dataset.
#
# @return [Integer] 0 when the dataset does not exist or no handle could be
#   obtained; otherwise the size of the fully materialised row array
def count
  store = dataset_exists? ? cached_dataset : nil
  store ? store.to_a.size : 0
end
#dataset_exists? ⇒ Boolean
309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
# File 'lib/ragnar/database.rb', line 309
# Reports whether a dataset can be found (and opened) at @db_path.
#
# @return [Boolean] false when the path is absent or opening it raises;
#   true when a cached handle is present or the open succeeds
def dataset_exists?
  return false unless File.exist?(@db_path)

  # A cached handle is taken as proof that the dataset exists.
  return true if @dataset_cache

  begin
    # Probe only: the handle returned here is intentionally not cached.
    Lancelot::Dataset.open(@db_path)
    true
  rescue StandardError
    false
  end
end
#document_count ⇒ Object
Get the total number of documents in the database.
279 280 281 |
# File 'lib/ragnar/database.rb', line 279
# Total number of documents in the database; simply delegates to #count.
#
# @return [Object] whatever #count returns
def document_count
  count
end
#full_text_search(query, limit: 10) ⇒ Object
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 |
# File 'lib/ragnar/database.rb', line 254
# Runs a full-text query against the :chunk_text column.
#
# @param query [Object] passed through to the dataset's full_text_search
# @param limit [Integer] maximum number of rows requested (default 10)
# @return [Array<Hash>] one hash per hit with :id, :chunk_text, :file_path,
#   :chunk_index and parsed :metadata; [] when there is no dataset
def full_text_search(query, limit: 10)
  return [] unless dataset_exists?

  store = cached_dataset
  return [] unless store

  hits = store.full_text_search(query, columns: [:chunk_text], limit: limit)

  hits.map do |hit|
    {
      id: hit[:id],
      chunk_text: hit[:chunk_text],
      file_path: hit[:file_path],
      chunk_index: hit[:chunk_index],
      # A missing metadata value is parsed as an empty JSON object.
      metadata: JSON.parse(hit[:metadata] || "{}")
    }
  end
end
#get_all_documents_with_embeddings(limit: nil) ⇒ Object
243 244 245 246 247 248 249 250 251 252 |
# File 'lib/ragnar/database.rb', line 243
# Returns the stored documents that carry a non-empty :embedding.
#
# @param limit [Integer, nil] when given, only the first +limit+ rows are
#   read; the embedding filter is applied afterwards, so fewer than +limit+
#   documents may come back
# @return [Array<Hash>] rows as returned by the dataset; [] when there is no
#   dataset
def get_all_documents_with_embeddings(limit: nil)
  return [] unless dataset_exists?

  dataset = cached_dataset
  return [] unless dataset

  all_docs = limit ? dataset.first(limit) : dataset.to_a

  all_docs.select { |doc| doc[:embedding] && !doc[:embedding].empty? }
end
#get_documents_by_ids(ids) ⇒ Array<Hash>
Get documents by their IDs.
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 |
# File 'lib/ragnar/database.rb', line 286
# Get documents by their IDs.
#
# @param ids [Array, Set, Object, nil] IDs to look up; nil is treated as
#   "no IDs" and a single non-collection value as a one-element list
# @return [Array<Hash>] matching documents with :id, :chunk_text,
#   :file_path, :chunk_index, :embedding, :reduced_embedding and parsed
#   :metadata; [] when nothing is requested or there is no dataset
def get_documents_by_ids(ids)
  # Array() turns nil into [] and wraps a lone ID, so callers cannot trip a
  # NoMethodError on #empty?.
  wanted = Array(ids)
  return [] if wanted.empty? || !dataset_exists?

  dataset = cached_dataset
  return [] unless dataset

  # Create ID lookup set for efficiency
  id_set = wanted.to_set

  # Filter documents by IDs
  dataset.to_a.select { |doc| id_set.include?(doc[:id]) }.map do |doc|
    {
      id: doc[:id],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      reduced_embedding: doc[:reduced_embedding],
      # A missing metadata value is parsed as an empty JSON object.
      metadata: JSON.parse(doc[:metadata] || "{}")
    }
  end
end
#get_embeddings(limit: nil, offset: 0) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/ragnar/database.rb', line 50
# Returns the id and embedding columns for a window of stored documents.
#
# @param limit [Integer, nil] maximum number of rows; nil means no cap
# @param offset [Integer] number of leading rows to skip (default 0)
# @return [Array<Hash>] hashes with :id, :embedding and :reduced_embedding;
#   [] when there is no dataset
def get_embeddings(limit: nil, offset: 0)
  return [] unless dataset_exists?

  dataset = cached_dataset
  return [] unless dataset

  # Get all documents or a subset
  docs = if limit && offset > 0
    # Get limit + offset items, then drop offset
    dataset.first(limit + offset).drop(offset)
  elsif limit
    dataset.first(limit)
  else
    dataset.to_a.drop(offset)
  end

  docs.map do |doc|
    {
      id: doc[:id],
      embedding: doc[:embedding],
      reduced_embedding: doc[:reduced_embedding]
    }
  end
end
#get_stats ⇒ Object
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
# File 'lib/ragnar/database.rb', line 174
# Summarises the contents of the dataset.
#
# @return [Hash] when no dataset is available: zeroed :document_count,
#   :total_documents, :unique_files, :total_chunks, :with_embeddings,
#   :with_reduced_embeddings and :total_size_mb. Otherwise those keys plus
#   :avg_chunk_size, :embedding_dims and :reduced_dims computed from every
#   stored row.
def get_stats
  dataset = dataset_exists? ? cached_dataset : nil

  # One shared zero result covers both "no dataset on disk" and "no handle".
  unless dataset
    return {
      document_count: 0,
      total_documents: 0,
      unique_files: 0,
      total_chunks: 0,
      with_embeddings: 0,
      with_reduced_embeddings: 0,
      total_size_mb: 0.0
    }
  end

  # Get all documents
  all_docs = dataset.to_a

  with_embedding = all_docs.select { |d| d[:embedding] && !d[:embedding].empty? }
  with_reduced = all_docs.select { |d| d[:reduced_embedding] && !d[:reduced_embedding].empty? }

  # NOTE(review): String#size counts characters, not bytes, so total_size_mb
  # is approximate for multi-byte text; kept as-is to preserve reported values.
  chunk_sizes = all_docs.select { |d| d[:chunk_text] }.map { |d| d[:chunk_text].size }
  total_bytes = chunk_sizes.sum

  {
    document_count: all_docs.size, # Add for compatibility with specs
    total_documents: all_docs.size,
    total_chunks: all_docs.size,
    unique_files: all_docs.map { |d| d[:file_path] }.uniq.size,
    with_embeddings: with_embedding.size,
    with_reduced_embeddings: with_reduced.size,
    avg_chunk_size: chunk_sizes.any? ? (total_bytes.to_f / chunk_sizes.size).round : 0,
    total_size_mb: (total_bytes / 1024.0 / 1024.0).round(2), # Add for CLI stats command
    # Dimensions are read from the first row that has a non-empty vector.
    embedding_dims: with_embedding.first && with_embedding.first[:embedding].size,
    reduced_dims: with_reduced.first && with_reduced.first[:reduced_embedding].size
  }
end
#search_similar(embedding, k: 10, use_reduced: false) ⇒ Object
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/ragnar/database.rb', line 138
# Finds the stored rows closest to a query vector.
#
# @param embedding [#to_a] query vector; converted with #to_a before the
#   search call
# @param k [Integer] number of neighbours requested (default 10)
# @param use_reduced [Boolean] search the :reduced_embedding column instead
#   of :embedding
# @return [Array<Hash>] hashes with :id, :chunk_text, :file_path,
#   :chunk_index, :distance and parsed :metadata; [] when there is no dataset
def search_similar(embedding, k: 10, use_reduced: false)
  return [] unless dataset_exists?

  dataset = cached_dataset
  return [] unless dataset

  column = use_reduced ? :reduced_embedding : :embedding

  # Perform vector search
  results = dataset.vector_search(
    embedding.to_a,
    column: column,
    limit: k
  )

  results.map do |row|
    {
      id: row[:id],
      chunk_text: row[:chunk_text],
      file_path: row[:file_path],
      chunk_index: row[:chunk_index],
      # The search result exposes the score under :_distance.
      distance: row[:_distance],
      metadata: JSON.parse(row[:metadata] || "{}")
    }
  end
end
#update_reduced_embeddings(updates) ⇒ Object
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/ragnar/database.rb', line 75
# Stores reduced embeddings for existing documents by rewriting the dataset
# with an extra :reduced_embedding vector column.
#
# The dataset directory at @db_path is deleted and recreated, so every
# existing row is read into memory first.
#
# @param updates [Array<Hash>] each hash supplies :id and :reduced_embedding
# @return [Object, nil] nil when there is nothing to update, no dataset
#   handle, or no stored rows; otherwise the result of the final
#   clear_dataset_cache call
def update_reduced_embeddings(updates)
  return if updates.empty?

  dataset = cached_dataset
  return unless dataset

  # Get all existing documents and safely extract their data
  all_docs = dataset.to_a.map do |doc|
    # NOTE(review): :content is copied although the schema below has no
    # such column - confirm how the dataset treats the extra key.
    {
      id: doc[:id],
      content: doc[:content],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      metadata: doc[:metadata],
      reduced_embedding: doc[:reduced_embedding]
    }
  end

  # With no stored rows there is nothing to update and no embedding to size
  # the schema from; bail out before the destructive rewrite below.
  return if all_docs.empty?

  # Create a map for quick lookup
  update_map = updates.each_with_object({}) do |update, map|
    map[update[:id]] = update[:reduced_embedding]
  end

  # Update documents with reduced embeddings
  updated_docs = all_docs.map do |doc|
    if update_map[doc[:id]]
      doc.merge(reduced_embedding: update_map[doc[:id]])
    else
      doc
    end
  end

  # Vector widths come from the first stored row and the first update.
  embedding_size = all_docs.first[:embedding].size
  reduced_size = updates.first[:reduced_embedding].size

  schema = {
    id: :string,
    chunk_text: :string,
    file_path: :string,
    chunk_index: :int64,
    embedding: { type: "vector", dimension: embedding_size },
    reduced_embedding: { type: "vector", dimension: reduced_size },
    metadata: :string
  }

  # Clear cache before recreating dataset
  clear_dataset_cache

  # Remove old dataset and create new one with updated data
  FileUtils.rm_rf(@db_path)

  # Use open_or_create which will create since we just deleted the path
  dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
  dataset.add_documents(updated_docs)

  # Clear cache after modification
  clear_dataset_cache
end