Class: Ragnar::Database

Inherits:

Object

Object
Ragnar::Database

show all

Defined in: lib/ragnar/database.rb

Instance Attribute Summary collapse

#db_path ⇒ Object readonly

Returns the value of attribute db_path.
#table_name ⇒ Object readonly

Returns the value of attribute table_name.

Instance Method Summary collapse

#add_documents(documents) ⇒ Object
#count ⇒ Object
#dataset_exists? ⇒ Boolean
#document_count ⇒ Object

Get the total number of documents in the database.
#full_text_search(query, limit: 10) ⇒ Object
#get_all_documents_with_embeddings(limit: nil) ⇒ Object
#get_documents_by_ids(ids) ⇒ Array<Hash>

Get documents by their IDs.
#get_embeddings(limit: nil, offset: 0) ⇒ Object
#get_stats ⇒ Object
#initialize(db_path, table_name: "documents") ⇒ Database constructor

A new instance of Database.
#search_similar(embedding, k: 10, use_reduced: false) ⇒ Object
#update_reduced_embeddings(updates) ⇒ Object

Constructor Details

#initialize(db_path, table_name: "documents") ⇒ `Database`

Returns a new instance of Database.

# File 'lib/ragnar/database.rb', line 5

# Sets up a handle for the dataset stored at +db_path+.
#
# @param db_path [Object] dataset location; the other methods pass it to
#   File.exist? and Lancelot::Dataset
# @param table_name [Object] name kept on the instance and exposed through
#   the +table_name+ reader
def initialize(db_path, table_name: "documents")
  # No dataset handle is memoized yet; the cache exists to prevent file
  # descriptor leaks.
  @dataset_cache = nil
  @db_path, @table_name = db_path, table_name
  ensure_database_exists
end

Instance Attribute Details

#db_path ⇒ `Object` (readonly)

Returns the value of attribute db_path.



3
4
5

# File 'lib/ragnar/database.rb', line 3

# Read-only accessor for the +@db_path+ instance variable.
#
# @return [Object] the stored dataset path
def db_path = @db_path

#table_name ⇒ `Object` (readonly)

Returns the value of attribute table_name.



3
4
5

# File 'lib/ragnar/database.rb', line 3

# Read-only accessor for the +@table_name+ instance variable.
#
# @return [Object] the stored table name
def table_name = @table_name

Instance Method Details

#add_documents(documents) ⇒ `Object`

# File 'lib/ragnar/database.rb', line 12

# Appends a batch of documents to the dataset at +@db_path+.
#
# Each document is read through +[]+ for the keys :id, :chunk_text,
# :file_path, :chunk_index, :embedding and :metadata; any other keys are
# ignored. Metadata is stored as a JSON string.
#
# @param documents [Array<Hash>] documents to store; an empty batch is a no-op
# @return [Object, nil] nil for an empty batch, otherwise whatever
#   +clear_dataset_cache+ returns
def add_documents(documents)
  return if documents.empty?

  # The vector column width is taken from the first document's embedding.
  vector_width = documents.first[:embedding].size
  schema = {
    id: :string,
    chunk_text: :string,
    file_path: :string,
    chunk_index: :int64,
    embedding: { type: "vector", dimension: vector_width },
    metadata: :string
  }

  # Copy only the known columns; metadata is serialised to JSON last.
  copied_fields = %i[id chunk_text file_path chunk_index embedding]
  rows = documents.map do |doc|
    row = copied_fields.to_h { |field| [field, doc[field]] }
    row[:metadata] = doc[:metadata].to_json
    row
  end

  # Drop any memoized handle before writing ...
  clear_dataset_cache

  # open_or_create covers both a brand-new and an already existing dataset.
  Lancelot::Dataset.open_or_create(@db_path, schema: schema).add_documents(rows)

  # ... and again afterwards so the next read sees fresh data.
  clear_dataset_cache
end

#count ⇒ `Object`

# File 'lib/ragnar/database.rb', line 165

# Number of rows currently stored.
#
# Every row is materialised with +to_a+ just to measure the result.
#
# @return [Integer] 0 when the dataset does not exist or no handle is available
def count
  dataset = dataset_exists? ? cached_dataset : nil
  dataset ? dataset.to_a.size : 0
end

#dataset_exists? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/ragnar/database.rb', line 309

# Reports whether a usable dataset is present at +@db_path+.
#
# The answer is true straight away when a handle is already memoized in
# +@dataset_cache+. Otherwise the dataset is opened once as a probe; the
# probe handle is deliberately neither kept nor cached.
#
# @return [Boolean] false when the path is missing or opening it raises a
#   StandardError
def dataset_exists?
  return false unless File.exist?(@db_path)
  return true if @dataset_cache

  begin
    # Probe only: the return value is not needed, success is the signal.
    Lancelot::Dataset.open(@db_path)
    true
  rescue StandardError
    false
  end
end

#document_count ⇒ `Object`

Get the total number of documents in the database.



279
280
281

# File 'lib/ragnar/database.rb', line 279

# Total number of documents in the database; delegates to +count+.
#
# @return [Object] whatever +count+ returns
def document_count = count

#full_text_search(query, limit: 10) ⇒ `Object`

# File 'lib/ragnar/database.rb', line 254

# Runs a full-text search over the :chunk_text column.
#
# @param query [Object] search expression, passed through to the dataset's
#   +full_text_search+
# @param limit [Integer] maximum number of hits requested
# @return [Array<Hash>] one hash per hit with :id, :chunk_text, :file_path,
#   :chunk_index and :metadata (parsed from JSON, "{}" when absent); empty
#   when the dataset does not exist or no handle is available
def full_text_search(query, limit: 10)
  dataset = cached_dataset if dataset_exists?
  return [] unless dataset

  hits = dataset.full_text_search(query, columns: [:chunk_text], limit: limit)

  hits.map do |hit|
    record = %i[id chunk_text file_path chunk_index].to_h { |field| [field, hit[field]] }
    record[:metadata] = JSON.parse(hit[:metadata] || "{}")
    record
  end
end

#get_all_documents_with_embeddings(limit: nil) ⇒ `Object`

# File 'lib/ragnar/database.rb', line 243

# Returns the stored rows that carry a non-empty :embedding.
#
# The limit is applied before filtering, so fewer than +limit+ rows may be
# returned when some of the first rows have no embedding.
#
# @param limit [Integer, nil] how many rows to read; nil reads all of them
# @return [Array] the matching rows; empty when the dataset does not exist
#   or no handle is available
def get_all_documents_with_embeddings(limit: nil)
  dataset = dataset_exists? ? cached_dataset : nil
  return [] unless dataset

  candidates = limit ? dataset.first(limit) : dataset.to_a

  candidates.reject do |doc|
    vector = doc[:embedding]
    !vector || vector.empty?
  end
end

#get_documents_by_ids(ids) ⇒ `Array<Hash>`

Get documents by their IDs

Parameters:

ids (Array<String>) —

Document IDs to fetch

Returns:

(Array<Hash>) —

Documents with their embeddings

# File 'lib/ragnar/database.rb', line 286

# Fetches the documents whose :id is in +ids+.
#
# The whole dataset is scanned, and results come back in dataset order, not
# in the order of +ids+.
#
# @param ids [Array<String>] document IDs to fetch
# @return [Array<Hash>] hashes with :id, :chunk_text, :file_path,
#   :chunk_index, :embedding, :reduced_embedding and :metadata (parsed from
#   JSON, "{}" when absent); empty when +ids+ is empty, the dataset does not
#   exist, or no handle is available
def get_documents_by_ids(ids)
  return [] if ids.empty?
  return [] unless dataset_exists?

  dataset = cached_dataset
  return [] unless dataset

  # A Set keeps the per-row membership test cheap.
  wanted = ids.to_set

  dataset.to_a.each_with_object([]) do |doc, found|
    next unless wanted.include?(doc[:id])

    found << {
      id: doc[:id],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      reduced_embedding: doc[:reduced_embedding],
      metadata: JSON.parse(doc[:metadata] || "{}")
    }
  end
end

#get_embeddings(limit: nil, offset: 0) ⇒ `Object`

# File 'lib/ragnar/database.rb', line 50

# Returns the :id, :embedding and :reduced_embedding of stored rows,
# optionally windowed by +limit+ and +offset+.
#
# @param limit [Integer, nil] maximum number of rows; nil means no limit
# @param offset [Integer] number of leading rows to skip
# @return [Array<Hash>] empty when the dataset does not exist or no handle is
#   available
def get_embeddings(limit: nil, offset: 0)
  dataset = cached_dataset if dataset_exists?
  return [] unless dataset

  rows =
    if !limit
      # No limit: read everything, then skip the leading rows.
      dataset.to_a.drop(offset)
    elsif offset > 0
      # Read far enough to cover the skipped rows, then discard them.
      dataset.first(limit + offset).drop(offset)
    else
      dataset.first(limit)
    end

  rows.map do |row|
    %i[id embedding reduced_embedding].to_h { |field| [field, row[field]] }
  end
end

#get_stats ⇒ `Object`

# File 'lib/ragnar/database.rb', line 174

# Summarises the contents of the dataset.
#
# When the dataset does not exist or no handle is available, a reduced hash
# of zeroes is returned (no :avg_chunk_size, :embedding_dims or
# :reduced_dims keys). Otherwise every row is loaded and scanned once.
#
# @return [Hash] with :document_count, :total_documents and :total_chunks
#   (all the row count), :unique_files, :with_embeddings,
#   :with_reduced_embeddings, :avg_chunk_size (characters, rounded),
#   :total_size_mb (bytes of chunk text / 1024 / 1024, 2 decimals),
#   :embedding_dims and :reduced_dims (size of the first non-empty vector
#   seen, or nil)
def get_stats
  dataset = cached_dataset if dataset_exists?
  unless dataset
    return {
      document_count: 0,
      total_documents: 0,
      unique_files: 0,
      total_chunks: 0,
      with_embeddings: 0,
      with_reduced_embeddings: 0,
      total_size_mb: 0.0
    }
  end

  all_docs = dataset.to_a

  stats = {
    document_count: all_docs.size,  # Kept for compatibility with specs
    total_documents: all_docs.size,
    total_chunks: all_docs.size,
    unique_files: all_docs.map { |d| d[:file_path] }.uniq.size,
    with_embeddings: 0,
    with_reduced_embeddings: 0,
    avg_chunk_size: 0,
    total_size_mb: 0,  # Used by the CLI stats command
    embedding_dims: nil,
    reduced_dims: nil
  }

  chunk_count = 0
  total_chars = 0
  total_bytes = 0

  all_docs.each do |doc|
    embedding = doc[:embedding]
    if embedding && !embedding.empty?
      stats[:with_embeddings] += 1
      stats[:embedding_dims] ||= embedding.size
    end

    reduced = doc[:reduced_embedding]
    if reduced && !reduced.empty?
      stats[:with_reduced_embeddings] += 1
      stats[:reduced_dims] ||= reduced.size
    end

    text = doc[:chunk_text]
    next unless text

    chunk_count += 1
    total_chars += text.size
    # bytesize, not size: a multibyte character occupies more than one byte,
    # so counting characters would under-report the size in MB.
    total_bytes += text.bytesize
  end

  stats[:avg_chunk_size] = (total_chars.to_f / chunk_count).round if chunk_count.positive?
  stats[:total_size_mb] = (total_bytes / 1024.0 / 1024.0).round(2)

  stats
end

#search_similar(embedding, k: 10, use_reduced: false) ⇒ `Object`

# File 'lib/ragnar/database.rb', line 138

# Nearest-neighbour lookup against the stored vectors.
#
# @param embedding [#to_a] query vector
# @param k [Integer] number of neighbours requested
# @param use_reduced [Boolean] search the :reduced_embedding column instead
#   of :embedding
# @return [Array<Hash>] hashes with :id, :chunk_text, :file_path,
#   :chunk_index, :distance (the row's :_distance) and :metadata (parsed from
#   JSON, "{}" when absent); empty when the dataset does not exist or no
#   handle is available
def search_similar(embedding, k: 10, use_reduced: false)
  dataset = dataset_exists? ? cached_dataset : nil
  return [] unless dataset

  vector_column =
    if use_reduced
      :reduced_embedding
    else
      :embedding
    end

  neighbours = dataset.vector_search(embedding.to_a, column: vector_column, limit: k)

  neighbours.map do |hit|
    record = %i[id chunk_text file_path chunk_index].to_h { |field| [field, hit[field]] }
    record[:distance] = hit[:_distance]
    record[:metadata] = JSON.parse(hit[:metadata] || "{}")
    record
  end
end

#update_reduced_embeddings(updates) ⇒ `Object`

# File 'lib/ragnar/database.rb', line 75

# Stores reduced embeddings by rewriting the whole dataset.
#
# All rows are read into memory, the rows named in +updates+ get their
# :reduced_embedding replaced, then the dataset directory is deleted and
# recreated with a schema that includes the :reduced_embedding column.
#
# NOTE(review): the rewrite is not atomic. If recreating the dataset raises
# after the old directory has been removed, the previous data is gone.
#
# @param updates [Array<Hash>] hashes with :id and :reduced_embedding
# @return [Object, nil] nil when there is nothing to do (no updates, no
#   dataset handle, or an empty dataset); otherwise whatever
#   +clear_dataset_cache+ returns
def update_reduced_embeddings(updates)
  return if updates.empty?

  dataset = cached_dataset
  return unless dataset

  # Copy only the fields this class knows about out of each stored row.
  all_docs = dataset.to_a.map do |doc|
    {
      id: doc[:id],
      content: doc[:content],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      metadata: doc[:metadata],
      reduced_embedding: doc[:reduced_embedding]
    }
  end

  # An empty dataset has no row to take the embedding width from and nothing
  # to update; bail out before the destructive rewrite.
  return if all_docs.empty?

  # id => new reduced embedding, for constant-time lookup per row.
  update_map = updates.each_with_object({}) do |update, map|
    map[update[:id]] = update[:reduced_embedding]
  end

  updated_docs = all_docs.map do |doc|
    replacement = update_map[doc[:id]]
    replacement ? doc.merge(reduced_embedding: replacement) : doc
  end

  # Vector widths come from the first stored row and the first update.
  embedding_size = all_docs.first[:embedding].size
  reduced_size = updates.first[:reduced_embedding].size

  schema = {
    id: :string,
    chunk_text: :string,
    file_path: :string,
    chunk_index: :int64,
    embedding: { type: "vector", dimension: embedding_size },
    reduced_embedding: { type: "vector", dimension: reduced_size },
    metadata: :string
  }

  # Drop the memoized handle before the dataset is replaced.
  clear_dataset_cache

  # Remove the old dataset; open_or_create then creates a fresh one because
  # the path no longer exists.
  FileUtils.rm_rf(@db_path)
  rebuilt = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
  rebuilt.add_documents(updated_docs)

  # Drop it again so the next read sees the rewritten data.
  clear_dataset_cache
end

Class: Ragnar::Database

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(db_path, table_name: "documents") ⇒ Database

Instance Attribute Details

#db_path ⇒ Object (readonly)

#table_name ⇒ Object (readonly)

Instance Method Details

#add_documents(documents) ⇒ Object

#count ⇒ Object

#dataset_exists? ⇒ Boolean

#document_count ⇒ Object

#full_text_search(query, limit: 10) ⇒ Object

#get_all_documents_with_embeddings(limit: nil) ⇒ Object

#get_documents_by_ids(ids) ⇒ Array<Hash>

#get_embeddings(limit: nil, offset: 0) ⇒ Object

#get_stats ⇒ Object

#search_similar(embedding, k: 10, use_reduced: false) ⇒ Object

#update_reduced_embeddings(updates) ⇒ Object