Class: Lancelot::Dataset
- Inherits: Object
- Inheritance chain: Object → Lancelot::Dataset
- Includes:
- Enumerable
- Defined in:
- lib/lancelot/dataset.rb
Class Method Summary
- .create(path, schema:) ⇒ Object
- .open(path) ⇒ Object
- .open_or_create(path, schema:, mode: nil) ⇒ Object
Instance Method Summary
- #<<(document) ⇒ Object
- #==(other) ⇒ Object (also: #eql?)
- #add_documents(documents) ⇒ Object
- #all ⇒ Object
- #count(&block) ⇒ Object: Overrides Enumerable’s count to use our efficient count_rows when no block is given.
- #each(&block) ⇒ Object
- #first(n = nil) ⇒ Object
- #hash ⇒ Object
- #hybrid_search(query, vector_column: "vector", text_column: nil, text_columns: nil, vector: nil, limit: 10, rrf_k: 60) ⇒ Object
- #nearest_neighbors(vector, k: 10, column: "vector") ⇒ Object
- #size ⇒ Object (also: #length)
- #text_search(query, column: nil, columns: nil, limit: 10) ⇒ Object
- #to_s ⇒ Object (also: #inspect)
- #vector_search(query_vector, column: "vector", limit: 10) ⇒ Object
- #where(filter_expression, limit: nil) ⇒ Object
Class Method Details
.create(path, schema:) ⇒ Object
8 9 10 11 12 |
# File 'lib/lancelot/dataset.rb', line 8
# Builds a handle for +path+ and creates the underlying dataset from
# +schema+, which is passed through normalize_schema first.
#
# @param path location handed to the constructor (presumably a filesystem path — confirm against callers)
# @param schema schema description; normalized by normalize_schema before use
# @return the newly constructed dataset instance
def create(path, schema:)
  new(path).tap do |handle|
    handle.create(normalize_schema(schema))
  end
end
.open(path) ⇒ Object
14 15 16 17 18 |
# File 'lib/lancelot/dataset.rb', line 14
# Builds a handle for +path+, opens it, and returns the handle.
#
# @param path location handed to the constructor
# @return the opened dataset instance (not the return value of the instance-level open)
def open(path)
  new(path).tap { |handle| handle.open }
end
.open_or_create(path, schema:, mode: nil) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/lancelot/dataset.rb', line 20
# Opens the dataset at +path+ if one is already there, otherwise creates it.
#
# Decision table:
# * path does not exist               -> create
# * path is a regular file            -> replace it when overwriting, else raise
# * directory containing "_versions"  -> open (treated as an existing Lance dataset)
# * any other non-empty directory     -> replace it when overwriting, else raise
# * empty directory                   -> create
#
# @param path [String] filesystem location of the dataset
# @param schema schema forwarded to create when a dataset has to be created
# @param mode [String, Symbol, nil] "overwrite" or :overwrite allows replacing a
#   file or a non-dataset directory found at +path+; any other value does not
# @return whatever create or open returns
# @raise [ArgumentError] if +path+ is a file or a non-empty non-dataset
#   directory and overwriting was not requested
def open_or_create(path, schema:, mode: nil)
  # Accept both "overwrite" and :overwrite; nil.to_s is "", so the default stays non-destructive.
  overwrite = mode.to_s == "overwrite"

  # Nothing at path yet: just create a new dataset.
  return create(path, schema: schema) unless File.exist?(path)

  if File.file?(path)
    unless overwrite
      raise ArgumentError, "Path #{path} exists as a file, not a directory. " \
                           "Use mode: 'overwrite' to replace it, or choose a different path."
    end

    FileUtils.rm_f(path)
    return create(path, schema: schema)
  end

  # A "_versions" entry marks the directory as an existing dataset: open it as-is.
  return open(path) if File.exist?(File.join(path, "_versions"))

  unless Dir.empty?(path)
    # Fail safely: never clobber unrelated content unless explicitly asked to.
    unless overwrite
      raise ArgumentError, "Directory exists at #{path} but is not a valid Lance dataset. " \
                           "Use mode: 'overwrite' to replace it, or choose a different path."
    end

    FileUtils.rm_rf(path)
  end

  # Either the directory was empty or it has just been removed.
  create(path, schema: schema)
end
Instance Method Details
#<<(document) ⇒ Object
88 89 90 91 |
# File 'lib/lancelot/dataset.rb', line 88
# Appends a single document by delegating to add_documents with a
# one-element array.
#
# @param document the document to append
# @return [self] the receiver, so appends can be chained
def <<(document)
  tap { add_documents([document]) }
end
#==(other) ⇒ Object Also known as: eql?
198 199 200 |
# File 'lib/lancelot/dataset.rb', line 198
# Two datasets are equal when the other object is a Dataset with the same path.
#
# @param other [Object] object to compare against
# @return [Boolean] false for non-Dataset objects, otherwise the result of comparing paths
def ==(other)
  return false unless other.is_a?(Dataset)

  other.path == path
end
#add_documents(documents) ⇒ Object
84 85 86 |
# File 'lib/lancelot/dataset.rb', line 84
# Normalizes every document with normalize_document and hands the resulting
# array to add_data.
#
# @param documents [#map] collection of documents to add
# @return whatever add_data returns
def add_documents(documents)
  normalized = documents.map { |entry| normalize_document(entry) }
  add_data(normalized)
end
#all ⇒ Object
108 109 110 |
# File 'lib/lancelot/dataset.rb', line 108
# Returns every row of the dataset by delegating to scan_all.
#
# @return the result of scan_all, unchanged
def all
  scan_all
end
#count(&block) ⇒ Object
Overrides Enumerable’s count to use our efficient count_rows when no block is given.
100 101 102 103 104 105 106 |
# File 'lib/lancelot/dataset.rb', line 100
# Overrides Enumerable's count so the plain, argument-less call uses
# count_rows instead of iterating.
#
# Enumerable#count has three call forms: count, count(item) and
# count { |row| ... }. Only the first is answered by count_rows; the other
# two are forwarded unchanged to Enumerable's implementation, so count(item)
# keeps working instead of raising ArgumentError.
#
# @param args optional single item to count occurrences of (Enumerable semantics)
# @yield [row] optional predicate; rows for which it is truthy are counted
# @return the row count from count_rows, or Enumerable#count's result
def count(*args, &block)
  return count_rows if args.empty? && !block_given?

  # Bare super forwards both the arguments and the block as received.
  super
end
#each(&block) ⇒ Object
120 121 122 123 |
# File 'lib/lancelot/dataset.rb', line 120
# Yields each row returned by scan_all; this is the iteration primitive that
# Enumerable builds on.
#
# @yield [row] each element of scan_all's result
# @return [Enumerator] when called without a block, otherwise the result of
#   calling each on scan_all's return value
def each(&block)
  return to_enum(:each) unless block

  scan_all.each(&block)
end
#first(n = nil) ⇒ Object
112 113 114 115 116 117 118 |
# File 'lib/lancelot/dataset.rb', line 112
# Returns the first row, or the first +n+ rows, by using scan_limit.
#
# @param n [Integer, nil] number of rows wanted; nil asks for a single row
# @return the first element of scan_limit(1) when +n+ is nil, otherwise
#   the result of scan_limit(n)
def first(n = nil)
  return scan_limit(n) unless n.nil?

  scan_limit(1).first
end
#hash ⇒ Object
203 204 205 |
# File 'lib/lancelot/dataset.rb', line 203
# Hash code derived solely from the dataset path.
#
# @return [Integer] the hash of path
def hash
  path.hash
end
#hybrid_search(query, vector_column: "vector", text_column: nil, text_columns: nil, vector: nil, limit: 10, rrf_k: 60) ⇒ Object
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/lancelot/dataset.rb', line 157
# Combines vector search and text search, merging the two rankings with
# Lancelot::RankFusion.reciprocal_rank_fusion.
#
# Each search that runs asks for limit * 2 candidates. A search contributes
# only if it returned results; with no contributing search the result is [],
# and with exactly one its results are returned directly without fusion.
#
# @param query [String, nil] text query; text search is skipped when nil/false or empty
# @param vector_column [String] column forwarded to vector_search
# @param text_column column forwarded to text_search as column:
# @param text_columns columns forwarded to text_search as columns:
# @param vector [Array, nil] query vector; vector search is skipped when nil/false
# @param limit [Integer] maximum number of results returned
# @param rrf_k passed to reciprocal_rank_fusion as k:
# @return [Array] at most +limit+ results
# @raise [ArgumentError] if +vector+ is given but is not an Array
def hybrid_search(query, vector_column: "vector", text_column: nil, text_columns: nil, vector: nil, limit: 10, rrf_k: 60)
  # Loaded on demand, on every call, before any search is attempted.
  require 'lancelot/rank_fusion'

  rankings = []

  if vector
    raise ArgumentError, "Vector must be an array of numbers" unless vector.is_a?(Array)

    by_vector = vector_search(vector, column: vector_column, limit: limit * 2)
    rankings << by_vector if by_vector.any?
  end

  if query && !query.empty?
    by_text = text_search(query, column: text_column, columns: text_columns, limit: limit * 2)
    rankings << by_text if by_text.any?
  end

  case rankings.size
  when 0
    []
  when 1
    # Only one search contributed: no fusion needed.
    rankings.first[0...limit]
  else
    Lancelot::RankFusion.reciprocal_rank_fusion(rankings, k: rrf_k)[0...limit]
  end
end
#nearest_neighbors(vector, k: 10, column: "vector") ⇒ Object
135 136 137 |
# File 'lib/lancelot/dataset.rb', line 135
# Convenience wrapper around vector_search that names the result count +k+.
#
# @param vector query vector, forwarded unchanged
# @param k [Integer] number of neighbors, passed as limit:
# @param column [String] column forwarded as column:
# @return whatever vector_search returns
def nearest_neighbors(vector, k: 10, column: "vector")
  vector_search(vector, limit: k, column: column)
end
#size ⇒ Object Also known as: length
93 94 95 |
# File 'lib/lancelot/dataset.rb', line 93
# Number of rows in the dataset, as reported by count_rows.
#
# @return the result of count_rows
def size
  count_rows
end
#text_search(query, column: nil, columns: nil, limit: 10) ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/lancelot/dataset.rb', line 139
# Runs a text search over one column or several columns.
#
# With +columns+ the names are stringified and passed to
# _rust_multi_column_text_search. Otherwise a single column is searched via
# _rust_text_search, defaulting to "text".
#
# @param query [String] search text
# @param column single column name (mutually exclusive with +columns+)
# @param columns one or more column names (mutually exclusive with +column+)
# @param limit [Integer] forwarded to the underlying search
# @return whatever the underlying search call returns
# @raise [ArgumentError] if +query+ is not a String, or if both +column+ and
#   +columns+ are given
def text_search(query, column: nil, columns: nil, limit: 10)
  raise ArgumentError, "Query must be a string" unless query.is_a?(String)
  raise ArgumentError, "Cannot specify both column and columns" if column && columns

  if columns
    names = Array(columns).map(&:to_s)
    return _rust_multi_column_text_search(names, query, limit)
  end

  _rust_text_search((column || "text").to_s, query, limit)
end
#to_s ⇒ Object Also known as: inspect
193 194 195 |
# File 'lib/lancelot/dataset.rb', line 193
# Human-readable summary showing the dataset path and its row count.
# Calling count here means building the string queries the dataset size.
#
# @return [String]
def to_s
  location = path
  rows = count
  "#<Lancelot::Dataset path=\"#{location}\" count=#{rows}>"
end
#vector_search(query_vector, column: "vector", limit: 10) ⇒ Object
127 128 129 130 131 132 133 |
# File 'lib/lancelot/dataset.rb', line 127
# Searches +column+ for rows nearest to +query_vector+ via _rust_vector_search.
#
# @param query_vector [Array] query vector; only its being an Array is checked here
# @param column [#to_s] column name, stringified before the call
# @param limit [Integer] forwarded to the underlying search
# @return whatever _rust_vector_search returns
# @raise [ArgumentError] if +query_vector+ is not an Array
def vector_search(query_vector, column: "vector", limit: 10)
  raise ArgumentError, "Query vector must be an array of numbers" unless query_vector.is_a?(Array)

  _rust_vector_search(column.to_s, query_vector, limit)
end
#where(filter_expression, limit: nil) ⇒ Object
189 190 191 |
# File 'lib/lancelot/dataset.rb', line 189
# Filters the dataset by stringifying +filter_expression+ and passing it,
# with +limit+, to filter_scan.
#
# @param filter_expression [#to_s] filter to apply; the expression syntax is defined by filter_scan
# @param limit [Integer, nil] forwarded as-is; nil is passed through unchanged
# @return whatever filter_scan returns
def where(filter_expression, limit: nil)
  expression = filter_expression.to_s
  filter_scan(expression, limit)
end