Class: Ragnar::UmapProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/ragnar/umap_processor.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin") ⇒ UmapProcessor

Returns a new instance of UmapProcessor.



8
9
10
11
12
# File 'lib/ragnar/umap_processor.rb', line 8

# Builds a processor bound to a database and a UMAP model file location.
#
# No model is loaded at construction time; @umap_model starts out nil.
#
# @param db_path [String] path handed to Database.new
# @param model_path [String] location of the UMAP model file
def initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin")
  @model_path = model_path
  @database = Database.new(db_path)
  @umap_model = nil
end

Instance Attribute Details

#database ⇒ Object (readonly)

Returns the value of attribute database.



6
7
8
# File 'lib/ragnar/umap_processor.rb', line 6

# Read-only accessor for the database handle.
#
# @return [Object] the value of @database (a Database instance when set by #initialize)
def database
  @database
end

#model_path ⇒ Object (readonly)

Returns the value of attribute model_path.



6
7
8
# File 'lib/ragnar/umap_processor.rb', line 6

# Read-only accessor for the UMAP model file location.
#
# @return [Object] the value of @model_path ("umap_model.bin" by default, per #initialize)
def model_path
  @model_path
end

Class Method Details

.optimal_dimensions(original_dims, target_ratio: 0.1) ⇒ Object



344
345
346
347
348
349
350
# File 'lib/ragnar/umap_processor.rb', line 344

# Suggests a target dimensionality for reduction.
#
# The original dimensionality is scaled by +target_ratio+ (10% by default)
# and truncated to an integer; the result is never lower than 50.
# Note that the floor of 50 is applied even when +original_dims+ itself is
# smaller than 50, so the suggestion can exceed the input dimensionality.
#
# @param original_dims [Numeric] dimensionality of the source embeddings
# @param target_ratio [Numeric] fraction of the original dimensions to keep
# @return [Integer] the larger of the scaled value and 50
def self.optimal_dimensions(original_dims, target_ratio: 0.1)
  floor = 50
  scaled = (original_dims * target_ratio).to_i
  scaled < floor ? floor : scaled
end

Instance Method Details

#apply(batch_size: 100) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/ragnar/umap_processor.rb', line 223

# Applies the trained UMAP model to every embedding stored in the database
# and writes the reduced embeddings back.
#
# Documents are processed in slices of +batch_size+. Within each slice,
# documents whose embedding is missing, not an Array, empty, or contains a
# non-numeric / NaN / infinite component are counted as skipped. The rest
# are transformed together and persisted. If transforming or persisting a
# slice raises a StandardError, only the documents of that slice that were
# not already counted as skipped are counted as errors, so that
# processed + skipped + errors never exceeds the number of documents.
#
# @param batch_size [Integer] number of documents per transform call
# @return [Hash] counts under the keys :processed, :skipped and :errors
def apply(batch_size: 100)
  # Load the trained UMAP model
  umap_model = load_umap_model

  puts "Applying UMAP transformation to database documents..."

  all_docs = @database.get_embeddings

  if all_docs.empty?
    puts "No embeddings found in database."
    return { processed: 0, skipped: 0, errors: 0 }
  end

  puts "Found #{all_docs.size} documents in database"

  processed_count = 0
  error_count = 0
  skipped_count = 0

  all_docs.each_slice(batch_size) do |batch|
    # Documents of this batch that have not been counted anywhere yet.
    unaccounted = batch.size

    begin
      valid_docs, invalid_docs = batch.partition do |doc|
        emb = doc[:embedding]
        # Numeric#finite? is false for NaN and +/-Infinity and is defined for
        # Integer too, unlike #nan? which only exists on Float-like numbers.
        emb.is_a?(Array) && !emb.empty? && emb.all? { |v| v.is_a?(Numeric) && v.finite? }
      end

      skipped_count += invalid_docs.size
      unaccounted -= invalid_docs.size

      next if valid_docs.empty?

      # Transform using the loaded UMAP model
      reduced_embeddings = umap_model.transform(valid_docs.map { |doc| doc[:embedding] })

      # Pair each valid document with its row of the transform output
      updates = valid_docs.each_with_index.map do |doc, row|
        { id: doc[:id], reduced_embedding: reduced_embeddings[row] }
      end

      @database.update_reduced_embeddings(updates)
      processed_count += updates.size

      puts "  Processed batch: #{updates.size} documents transformed"
    rescue => e
      puts "  ⚠️  Error processing batch: #{e.message}"
      # Skipped documents were already counted above; do not count them twice.
      error_count += unaccounted
    end
  end

  puts "\nUMAP application complete:"
  puts "  ✓ Processed: #{processed_count} documents"
  puts "  ⚠️  Skipped: #{skipped_count} documents (invalid embeddings)" if skipped_count > 0
  puts "  ❌ Errors: #{error_count} documents" if error_count > 0

  {
    processed: processed_count,
    skipped: skipped_count,
    errors: error_count
  }
end

#train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/ragnar/umap_processor.rb', line 14

# Trains a UMAP model on all embeddings stored in the database, then saves it.
#
# Steps, in order:
#   1. Load embeddings and drop nil entries.
#   2. Drop embeddings that are not non-empty Arrays of finite numbers.
#   3. If dimensions are inconsistent, keep only the most common dimension.
#   4. Clamp n_neighbors / n_components to the sample count and dimensionality.
#   5. Fit ClusterKit's UMAP, retrying with smaller parameters on LAPACK errors
#      (at most 3 attempts in total).
#   6. Record the final parameters in @model_params and call save_model.
#
# The fitted instance is kept in @umap_instance and the training-set
# projection in @reduced_embeddings.
#
# NOTE(review): min_dist is printed and stored in @model_params but is not
# passed to UMAP.new here — confirm whether ClusterKit supports it.
#
# @param n_components [Integer] requested number of output dimensions
# @param n_neighbors [Integer] requested neighborhood size
# @param min_dist [Float] recorded alongside the model parameters
# @return [Hash] :embeddings_count, :original_dims and :reduced_dims
# @raise [RuntimeError] when there are no or too few (< 10) usable embeddings,
#   or when UMAP fitting fails
def train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1)
  puts "Loading embeddings from database..."

  # Get all embeddings
  docs = @database.get_embeddings

  if docs.empty?
    raise "No embeddings found in database. Please index some documents first."
  end

  embeddings = docs.map { |d| d[:embedding] }.compact

  if embeddings.empty?
    raise "No valid embeddings found in database."
  end

  puts "Found #{embeddings.size} embeddings"

  # Validate values first, so that the dimension check below only ever calls
  # #size on real Arrays.
  invalid_count = 0
  nan_count = 0
  inf_count = 0

  valid_embeddings = embeddings.select do |embedding|
    if !embedding.is_a?(Array) || embedding.empty? || !embedding.all? { |v| v.is_a?(Numeric) }
      invalid_count += 1
      false
    elsif embedding.any? { |v| v.respond_to?(:nan?) && v.nan? }
      # Integer does not define #nan?, hence the respond_to? guard.
      nan_count += 1
      false
    elsif embedding.any? { |v| !v.finite? }
      inf_count += 1
      false
    else
      true
    end
  end

  if valid_embeddings.size < embeddings.size
    puts "\n  ⚠️  Data quality issues detected:"
    puts "     • Invalid embeddings: #{invalid_count}" if invalid_count > 0
    puts "     • Embeddings with NaN: #{nan_count}" if nan_count > 0
    puts "     • Embeddings with Infinity: #{inf_count}" if inf_count > 0
    puts "     • Total removed: #{embeddings.size - valid_embeddings.size}"
    puts "     • Remaining valid: #{valid_embeddings.size}"

    embeddings = valid_embeddings
  end

  if embeddings.empty?
    raise "No valid embeddings found after validation.\n\n" \
          "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
          "This suggests a problem with the embedding model or indexing process.\n\n" \
          "Please try:\n" \
          "  1. Re-indexing your documents: ragnar index <path> --force\n" \
          "  2. Using a different embedding model\n" \
          "  3. Checking your document content for unusual characters"
  end

  # Keep only the most common dimensionality when embeddings disagree.
  # group_by preserves first-seen order, so ties resolve to the dimension
  # that appears first.
  by_dimension = embeddings.group_by(&:size)
  if by_dimension.size > 1
    puts "  ⚠️  Warning: Inconsistent embedding dimensions found: #{by_dimension.keys.inspect}"
    puts "     This may cause errors during UMAP training."
    most_common_dim, embeddings = by_dimension.max_by { |_dim, group| group.size }
    puts "     Using only embeddings with #{most_common_dim} dimensions (#{embeddings.size} embeddings)"
  end

  if embeddings.size < 10
    raise "Too few valid embeddings (#{embeddings.size}) for UMAP training.\n\n" \
          "UMAP requires at least 10 samples to work effectively.\n" \
          "Please index more documents or check for data quality issues."
  end

  # Adjust parameters based on the number of samples
  # UMAP requires n_neighbors < n_samples
  # Also, n_components should be less than n_samples for stability
  n_samples = embeddings.size

  if n_neighbors >= n_samples
    requested_neighbors = n_neighbors
    n_neighbors = [3, (n_samples - 1) / 2].max
    puts "  Adjusted n_neighbors to #{n_neighbors} (was #{requested_neighbors}, but only have #{n_samples} samples)"
  end

  if n_components >= n_samples
    requested_components = n_components
    # NOTE(review): with the >= 10 sample guard above this always yields 2;
    # `.max` may have been intended — confirm before changing.
    n_components = [2, n_samples - 1].min
    puts "  Adjusted n_components to #{n_components} (was #{requested_components}, but only have #{n_samples} samples)"
  end

  # Warn if we have very few samples
  if n_samples < 100
    puts "\n  ⚠️  Warning: UMAP works best with at least 100 samples."
    puts "     You currently have #{n_samples} samples."
    puts "     Consider indexing more documents for better results."
  end

  # The embeddings (an Array of Arrays) are handed to fit_transform as-is.
  embedding_matrix = embeddings
  original_dims = embeddings.first.size

  # Ensure n_components is reasonable
  if n_components >= original_dims
    puts "  ⚠️  Warning: n_components (#{n_components}) >= original dimensions (#{original_dims})"
    n_components = [original_dims / 2, 50].min
    puts "     Reducing n_components to #{n_components}"
  end

  # For very high dimensional data, be more conservative
  if original_dims > 500 && n_components > 50
    puts "  ⚠️  Note: High dimensional data (#{original_dims}D) being reduced to #{n_components}D"
    puts "     Consider using n_components <= 50 for stability"
  end

  puts "\nTraining UMAP model..."
  puts "  Original dimensions: #{original_dims}"
  puts "  Target dimensions: #{n_components}"
  puts "  Neighbors: #{n_neighbors}"
  puts "  Min distance: #{min_dist}"

  # Perform the actual training using the class-based API
  puts "  Training UMAP model (this may take a moment)..."

  attempts = 0
  max_attempts = 3

  begin
    attempts += 1
    @umap_instance = ClusterKit::Dimensionality::UMAP.new(
      n_components: n_components,
      n_neighbors: n_neighbors
    )

    @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)

    puts "  ✓ UMAP training complete"
  rescue Exception => e
    # Catch Exception (not just StandardError) because Rust panics from
    # ClusterKit raise fatal errors that bypass the default rescue
    if e.message.include?("LapackInvalidValue") || e.message.include?("SGESDD") || e.message.include?("illegal value")
      if attempts < max_attempts
        # LAPACK SVD can fail with certain dimension combinations — retry with fewer components
        n_components = [n_components / 2, 2].max
        # NOTE(review): this can drop n_neighbors to 1 when n_components is 2 — confirm that is acceptable.
        n_neighbors = [n_neighbors, n_components - 1, 3].min
        puts "  ⚠️  LAPACK error, retrying with n_components=#{n_components}, n_neighbors=#{n_neighbors} (attempt #{attempts + 1}/#{max_attempts})..."
        retry
      end

      raise RuntimeError, "\n❌ UMAP training failed due to a LAPACK numerical error.\n\n" \
        "This can happen with certain data/dimension combinations.\n" \
        "Try reducing n_components:\n" \
        "  ragnar umap train --n-components 10 --n-neighbors 5\n\n" \
        "Current parameters:\n" \
        "  • n_components: #{n_components}\n" \
        "  • n_neighbors: #{n_neighbors}\n" \
        "  • embeddings: #{embeddings.size} samples\n" \
        "  • dimensions: #{original_dims}\n"
    elsif e.message.include?("index out of bounds")
      raise RuntimeError, "\n❌ UMAP training failed\n\n" \
        "The UMAP algorithm encountered an index out of bounds error.\n\n" \
        "This typically happens when:\n" \
        "  • The embedding data contains invalid values (NaN, Infinity)\n" \
        "  • The parameters are incompatible with your data\n" \
        "  • There are duplicate or corrupted embeddings\n\n" \
        "Suggested solutions:\n" \
        "  1. Try with more conservative parameters:\n" \
        "     ragnar umap train --n-components 10 --n-neighbors 5\n\n" \
        "  2. Re-index your documents to regenerate embeddings:\n" \
        "     ragnar index <path> --force\n\n" \
        "  3. Check your embedding model configuration\n\n" \
        "Current parameters:\n" \
        "  • n_components: #{n_components}\n" \
        "  • n_neighbors: #{n_neighbors}\n" \
        "  • embeddings: #{embeddings.size} samples\n" \
        "  • dimensions: #{original_dims}\n"
    elsif e.is_a?(StandardError) || e.message.include?("unwrap")
      raise RuntimeError, "\n❌ UMAP training failed\n\n" \
        "Error: #{e.message}\n\n" \
        "This may be due to incompatible parameters or data issues.\n" \
        "Try using more conservative parameters:\n" \
        "  ragnar umap train --n-components 10 --n-neighbors 5\n"
    else
      # Re-raise non-application exceptions (Interrupt, SignalException, etc.)
      raise
    end
  end

  # Store the parameters for saving
  @model_params = {
    n_components: n_components,
    n_neighbors: n_neighbors,
    min_dist: min_dist
  }

  # Save the model
  save_model

  {
    embeddings_count: embeddings.size,
    original_dims: original_dims,
    reduced_dims: n_components
  }
end