Class: Lancelot::Dataset

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/lancelot/dataset.rb

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.create(path, schema:) ⇒ Object



8
9
10
11
12
# File 'lib/lancelot/dataset.rb', line 8

# Instantiates a dataset for +path+ and creates it using the given schema.
#
# The schema is passed through +normalize_schema+ before being handed to the
# instance-level +create+.
#
# @param path the location handed to +new+
# @param schema the schema description to normalize and create with
# @return the dataset instance that was built (not the result of +create+)
def create(path, schema:)
  new(path).tap do |fresh|
    fresh.create(normalize_schema(schema))
  end
end

.open(path) ⇒ Object



14
15
16
17
18
# File 'lib/lancelot/dataset.rb', line 14

# Instantiates a dataset for +path+ and opens it.
#
# @param path the location handed to +new+
# @return the dataset instance (not the result of the instance-level +open+)
def open(path)
  new(path).tap { |handle| handle.open }
end

.open_or_create(path, schema:, mode: nil) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/lancelot/dataset.rb', line 20

# Opens the dataset at +path+ when one is already there, otherwise creates it.
#
# Decision table:
# * path does not exist                      -> create
# * path is a regular file                   -> create after deleting the file
#                                               (overwrite mode only), else raise
# * path contains a "_versions" entry        -> open (even in overwrite mode)
# * path is an empty directory               -> create
# * path is a non-empty, non-dataset dir     -> create after deleting the
#                                               directory (overwrite mode only),
#                                               else raise
#
# @param path [String] filesystem location of the dataset
# @param schema the schema forwarded to +create+ when a dataset must be created
# @param mode [String, Symbol, nil] pass "overwrite" or :overwrite to allow
#   replacing a conflicting file or non-dataset directory
# @return whatever +open+ or +create+ returns
# @raise [ArgumentError] when +path+ is a file or a non-empty non-dataset
#   directory and overwrite mode was not requested
def open_or_create(path, schema:, mode: nil)
  # Compare via to_s so that both "overwrite" and :overwrite are accepted.
  overwrite = mode.to_s == "overwrite"

  # Nothing at the path yet - create a new dataset.
  return create(path, schema: schema) unless File.exist?(path)

  if File.file?(path)
    unless overwrite
      raise ArgumentError, "Path #{path} exists as a file, not a directory. " \
                          "Use mode: 'overwrite' to replace it, or choose a different path."
    end

    FileUtils.rm_f(path)
    return create(path, schema: schema)
  end

  # A "_versions" entry is taken as the marker of an existing Lance dataset.
  return open(path) if File.exist?(File.join(path, "_versions"))

  # Empty directory - safe to create the dataset in place.
  return create(path, schema: schema) if Dir.empty?(path)

  # Non-empty directory that is not a dataset: only replace it on request.
  unless overwrite
    raise ArgumentError, "Directory exists at #{path} but is not a valid Lance dataset. " \
                        "Use mode: 'overwrite' to replace it, or choose a different path."
  end

  FileUtils.rm_rf(path)
  create(path, schema: schema)
end

Instance Method Details

#<<(document) ⇒ Object



88
89
90
91
# File 'lib/lancelot/dataset.rb', line 88

# Appends a single document by wrapping it in an array for +add_documents+.
#
# @param document the document to add
# @return [self] so that appends can be chained
def <<(document)
  tap { add_documents([document]) }
end

#==(other) ⇒ Object Also known as: eql?



198
199
200
# File 'lib/lancelot/dataset.rb', line 198

# Two datasets are equal when the other object is a Dataset and both report
# the same +path+.
#
# @param other the object to compare against
# @return [Boolean]
def ==(other)
  return false unless other.is_a?(Dataset)

  other.path == path
end

#add_documents(documents) ⇒ Object



84
85
86
# File 'lib/lancelot/dataset.rb', line 84

# Normalizes every document with +normalize_document+ and passes the
# resulting list to +add_data+.
#
# @param documents a collection responding to +map+
# @return whatever +add_data+ returns
def add_documents(documents)
  normalized = documents.map { |entry| normalize_document(entry) }
  add_data(normalized)
end

#all ⇒ Object



108
109
110
# File 'lib/lancelot/dataset.rb', line 108

# Returns every record by delegating to +scan_all+.
#
# @return whatever +scan_all+ returns
def all
  scan_all
end

#count(&block) ⇒ Object

Overrides Enumerable’s count to use our efficient count_rows when no block is given.



100
101
102
103
104
105
106
# File 'lib/lancelot/dataset.rb', line 100

# Counts rows, overriding Enumerable#count.
#
# Called with neither an argument nor a block, this returns +count_rows+
# directly instead of iterating through +each+. Any other call form
# (+count(item)+ or +count { ... }+) is forwarded unchanged to the inherited
# implementation, so the full Enumerable#count contract keeps working.
#
# @param args [Array] optional item to count occurrences of
# @yield [record] optional predicate; matching records are counted
# @return [Integer]
def count(*args, &block)
  return count_rows if args.empty? && block.nil?

  # Bare super forwards both the positional arguments and the block.
  super
end

#each(&block) ⇒ Object



120
121
122
123
# File 'lib/lancelot/dataset.rb', line 120

# Iterates over the records returned by +scan_all+.
#
# @yield [record] each element of the +scan_all+ result
# @return [Enumerator] when no block is given; otherwise the value returned
#   by calling +each+ on the +scan_all+ result
def each(&block)
  if block
    scan_all.each(&block)
  else
    to_enum(:each)
  end
end

#first(n = nil) ⇒ Object



112
113
114
115
116
117
118
# File 'lib/lancelot/dataset.rb', line 112

# Fetches leading records through +scan_limit+ rather than a full scan.
#
# @param n [Integer, nil] number of records wanted; +nil+ asks for a single
#   record
# @return the first element of +scan_limit(1)+ when +n+ is nil, otherwise the
#   result of +scan_limit(n)+
def first(n = nil)
  return scan_limit(n) unless n.nil?

  scan_limit(1).first
end

#hash ⇒ Object



203
204
205
# File 'lib/lancelot/dataset.rb', line 203

# Hash code derived solely from +path+, matching the path-based comparison
# performed by +==+.
#
# @return the value of +path.hash+
def hash
  path.hash
end

#hybrid_search(query, vector_column: "vector", text_column: nil, text_columns: nil, vector: nil, limit: 10, rrf_k: 60) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/lancelot/dataset.rb', line 157

# Runs a vector search and/or a text search and merges the rankings.
#
# A vector search runs only when +vector+ is given; a text search runs only
# when +query+ is present and non-empty. Each search requests +limit * 2+
# candidates. Searches that return nothing are left out of the merge.
#
# @param query [String, nil] text query; nil or empty skips the text search
# @param vector_column column name forwarded to +vector_search+
# @param text_column single column forwarded to +text_search+
# @param text_columns multiple columns forwarded to +text_search+
# @param vector [Array, nil] query vector; nil skips the vector search
# @param limit [Integer] maximum number of results returned
# @param rrf_k passed as +k:+ to the reciprocal rank fusion
# @return [Array] an empty array when no search produced results, the
#   truncated single ranking when only one did, otherwise the truncated
#   result of Lancelot::RankFusion.reciprocal_rank_fusion
# @raise [ArgumentError] when +vector+ is given but is not an Array
def hybrid_search(query, vector_column: "vector", text_column: nil, text_columns: nil,
                  vector: nil, limit: 10, rrf_k: 60)
  require 'lancelot/rank_fusion'

  rankings = []

  # Vector side: only attempted when the caller supplied a vector.
  if vector
    raise ArgumentError, "Vector must be an array of numbers" unless vector.is_a?(Array)

    by_vector = vector_search(vector, column: vector_column, limit: limit * 2)
    rankings << by_vector if by_vector.any?
  end

  # Text side: only attempted for a present, non-empty query.
  if query && !query.empty?
    by_text = text_search(query, column: text_column, columns: text_columns, limit: limit * 2)
    rankings << by_text if by_text.any?
  end

  case rankings.size
  when 0
    # Neither search ran or produced anything.
    []
  when 1
    # Only one ranking - nothing to fuse, just truncate.
    rankings.first[0...limit]
  else
    Lancelot::RankFusion.reciprocal_rank_fusion(rankings, k: rrf_k)[0...limit]
  end
end

#nearest_neighbors(vector, k: 10, column: "vector") ⇒ Object



135
136
137
# File 'lib/lancelot/dataset.rb', line 135

# Convenience wrapper around +vector_search+ that names the result limit +k+.
#
# @param vector the query vector, forwarded unchanged to +vector_search+
# @param k number of results requested (passed as +limit:+)
# @param column vector column name (passed as +column:+)
# @return whatever +vector_search+ returns
def nearest_neighbors(vector, k: 10, column: "vector")
  vector_search(vector, column: column, limit: k)
end

#size ⇒ Object Also known as: length



93
94
95
# File 'lib/lancelot/dataset.rb', line 93

# Number of rows, obtained from +count_rows+.
#
# @return whatever +count_rows+ returns
def size
  count_rows
end

#text_search(query, column: nil, columns: nil, limit: 10) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/lancelot/dataset.rb', line 139

# Runs a text search over one column or over several columns.
#
# @param query [String] the search text
# @param column single column to search; defaults to "text" when neither
#   +column+ nor +columns+ is given
# @param columns one or more columns to search; coerced with +Array()+ and
#   converted to strings
# @param limit maximum number of results, forwarded to the native search
# @return whatever the underlying +_rust_*+ search method returns
# @raise [ArgumentError] when +query+ is not a String, or when both +column+
#   and +columns+ are supplied
def text_search(query, column: nil, columns: nil, limit: 10)
  raise ArgumentError, "Query must be a string" unless query.is_a?(String)
  raise ArgumentError, "Cannot specify both column and columns" if column && columns

  if columns
    # Multi-column search
    names = Array(columns).map(&:to_s)
    return _rust_multi_column_text_search(names, query, limit)
  end

  # Single-column search, falling back to the "text" column
  _rust_text_search((column || "text").to_s, query, limit)
end

#to_s ⇒ Object Also known as: inspect



193
194
195
# File 'lib/lancelot/dataset.rb', line 193

# Human-readable summary containing the dataset path and its current count.
#
# Note that building the string calls +count+.
#
# @return [String]
def to_s
  format('#<Lancelot::Dataset path="%s" count=%s>', path, count)
end

#vector_search(query_vector, column: "vector", limit: 10) ⇒ Object



127
128
129
130
131
132
133
# File 'lib/lancelot/dataset.rb', line 127

# Runs a vector search through the native +_rust_vector_search+ binding.
#
# @param query_vector [Array] the query vector
# @param column vector column name; converted with +to_s+
# @param limit maximum number of results, forwarded unchanged
# @return whatever +_rust_vector_search+ returns
# @raise [ArgumentError] when +query_vector+ is not an Array
def vector_search(query_vector, column: "vector", limit: 10)
  raise ArgumentError, "Query vector must be an array of numbers" unless query_vector.is_a?(Array)

  column_name = column.to_s
  _rust_vector_search(column_name, query_vector, limit)
end

#where(filter_expression, limit: nil) ⇒ Object



189
190
191
# File 'lib/lancelot/dataset.rb', line 189

# Filters records by delegating to +filter_scan+.
#
# @param filter_expression the filter; converted with +to_s+ before use
# @param limit optional limit, forwarded unchanged (nil by default)
# @return whatever +filter_scan+ returns
def where(filter_expression, limit: nil)
  expression = filter_expression.to_s
  filter_scan(expression, limit)
end