Class: Ragnar::UmapProcessor
- Inherits: Object
- Object
- Ragnar::UmapProcessor
- Defined in:
- lib/ragnar/umap_processor.rb
Instance Attribute Summary collapse
-
#database ⇒ Object
readonly
Returns the value of attribute database.
-
#model_path ⇒ Object
readonly
Returns the value of attribute model_path.
Class Method Summary collapse
- .optimal_dimensions(original_dims, target_ratio: 0.1) ⇒ Object
Instance Method Summary collapse
- #apply(batch_size: 100) ⇒ Object
-
#initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin") ⇒ UmapProcessor
constructor
A new instance of UmapProcessor.
- #train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1) ⇒ Object
Constructor Details
#initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin") ⇒ UmapProcessor
Returns a new instance of UmapProcessor.
8 9 10 11 12 |
# File 'lib/ragnar/umap_processor.rb', line 8
# Builds a processor bound to a database and a model file location.
#
# @param db_path path handed to Database.new (defaults to Ragnar::DEFAULT_DB_PATH)
# @param model_path value stored in @model_path (defaults to "umap_model.bin")
def initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin")
  @model_path = model_path
  @database = Database.new(db_path)
  # No model is held in memory until one is trained or loaded.
  @umap_model = nil
end
Instance Attribute Details
#database ⇒ Object (readonly)
Returns the value of attribute database.
6 7 8 |
# File 'lib/ragnar/umap_processor.rb', line 6
# Read-only accessor for the database attribute.
#
# @return [Object] the current value of @database
def database
  @database
end
#model_path ⇒ Object (readonly)
Returns the value of attribute model_path.
6 7 8 |
# File 'lib/ragnar/umap_processor.rb', line 6
# Read-only accessor for the model_path attribute.
#
# @return [Object] the current value of @model_path
def model_path
  @model_path
end
Class Method Details
.optimal_dimensions(original_dims, target_ratio: 0.1) ⇒ Object
344 345 346 347 348 349 350 |
# File 'lib/ragnar/umap_processor.rb', line 344
# Suggests a target number of dimensions for a reduction.
#
# Heuristic: aim for target_ratio of the original dimensionality (10% by
# default) and keep at least 50 dimensions for good quality, but never
# suggest more dimensions than the input actually has.
#
# @param original_dims [Integer] dimensionality of the source embeddings
# @param target_ratio [Float] fraction of original_dims to aim for
# @return [Integer] suggested number of reduced dimensions
def self.optimal_dimensions(original_dims, target_ratio: 0.1)
  suggested = (original_dims * target_ratio).to_i
  # The floor of 50 must not push the suggestion above the input's own
  # dimensionality (e.g. a 30-dimensional input used to yield 50).
  [[suggested, 50].max, original_dims].min
end
Instance Method Details
#apply(batch_size: 100) ⇒ Object
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
# Applies a previously trained UMAP model to the embeddings stored in the database.
#
# Visible flow: loads the model through load_umap_model, walks the documents in
# slices of batch_size, skips embeddings that are nil, not an Array, empty, or
# that contain non-numeric / NaN / non-finite values, transforms the remaining
# ones with umap_model.transform, and sends { id:, reduced_embedding: } updates
# back through @database. A failure inside a batch is rescued, reported with
# puts, and counted as errors; processing continues with the next batch.
#
# @param batch_size [Integer] number of documents per each_slice batch (default 100)
# @return [Hash] counts under the keys :processed, :skipped and :errors
#
# NOTE(review): this listing appears to have lost several identifiers during
# extraction (e.g. the method name after "@database." and the left-hand side
# of " = []"); treat it as a reference, not as runnable code.
# NOTE(review): the rescue branch adds batch.size to error_count even though
# documents of that batch may already have been counted as skipped; confirm
# whether that double count is intended.
# NOTE(review): v.nan? is evaluated for every Numeric value; Integer does not
# define nan?, so an Integer component would presumably raise and send the
# whole batch to the rescue branch. Verify against real data.
# File 'lib/ragnar/umap_processor.rb', line 223 def apply(batch_size: 100) # Load the trained UMAP model umap_model = load_umap_model puts "Applying UMAP transformation to database documents..." # Get all embeddings from database all_docs = @database. if all_docs.empty? puts "No embeddings found in database." return { processed: 0, skipped: 0, errors: 0 } end puts "Found #{all_docs.size} documents in database" # Process in batches for memory efficiency processed_count = 0 error_count = 0 skipped_count = 0 all_docs.each_slice(batch_size) do |batch| begin # Extract embeddings = batch.map { |d| d[:embedding] } # Validate embeddings valid_indices = [] = [] .each_with_index do |emb, idx| if emb.nil? || !emb.is_a?(Array) || emb.empty? skipped_count += 1 next end if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? } skipped_count += 1 next end valid_indices << idx << emb end next if .empty? # Transform using the loaded UMAP model = umap_model.transform() # Prepare updates for valid documents updates = valid_indices.map.with_index do |batch_idx, transform_idx| { id: batch[batch_idx][:id], reduced_embedding: [transform_idx] } end # Update database @database.(updates) processed_count += updates.size puts " Processed batch: #{updates.size} documents transformed" rescue => e puts " ⚠️ Error processing batch: #{e.}" error_count += batch.size end end puts "\nUMAP application complete:" puts " ✓ Processed: #{processed_count} documents" puts " ⚠️ Skipped: #{skipped_count} documents (invalid embeddings)" if skipped_count > 0 puts " ❌ Errors: #{error_count} documents" if error_count > 0 { processed: processed_count, skipped: skipped_count, errors: error_count } end |
#train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1) ⇒ Object
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 |
# Trains a UMAP model on the embeddings stored in the database and saves it.
#
# Visible flow:
#   1. Reads documents from @database and raises when there are none, or when
#      no embedding is left after compact.
#   2. When more than one embedding size is present, keeps only the embeddings
#      of the most common size; then drops entries that are not Arrays or that
#      contain non-numeric, NaN or non-finite values, printing a summary.
#   3. Raises when nothing survives validation or when fewer than 10 remain.
#   4. Lowers n_neighbors / n_components when they are >= the sample count,
#      and lowers n_components again when it is >= the original dimensionality.
#   5. Fits ClusterKit::Dimensionality::UMAP through fit_transform. On errors
#      whose text mentions LapackInvalidValue / SGESDD / "illegal value" it
#      halves n_components (minimum 2), shrinks n_neighbors and retries while
#      attempts < max_attempts (3); other recognised failures are re-raised as
#      RuntimeError with troubleshooting text, anything else is re-raised as is.
#   6. Stores the final parameters in @model_params, calls save_model and
#      returns a summary hash.
#
# @param n_components requested output dimensionality
#   (default Ragnar::DEFAULT_REDUCED_DIMENSIONS); may be lowered as described above
# @param n_neighbors requested neighbourhood size (default 15); may be lowered
# @param min_dist (default 0.1) printed and recorded in @model_params
# @return [Hash] with the keys :embeddings_count, :original_dims and :reduced_dims
# @raise [RuntimeError] when no usable embeddings exist, fewer than 10 are
#   valid, or training fails
#
# NOTE(review): this listing appears to have lost several identifiers during
# extraction (e.g. the method name after "@database.", the variables in
# " = docs.map" and the method after "e."); treat it as a reference, not as
# runnable code.
# NOTE(review): the "Adjusted ..." messages interpolate the literals 15 and 50
# ("was #{15}", "was #{50}") rather than the values the caller passed.
# NOTE(review): n_components = [2, n_samples - 1].min always yields 2 here,
# because fewer than 10 samples already raised; possibly .max was intended.
# Confirm.
# NOTE(review): min_dist does not appear in the visible UMAP.new call; confirm
# whether it should be forwarded.
# NOTE(review): the &:nan? check runs on every Numeric value; Integer does not
# define nan?, so an Integer component would presumably raise NoMethodError.
# Verify.
# File 'lib/ragnar/umap_processor.rb', line 14 def train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1) puts "Loading embeddings from database..." # Get all embeddings docs = @database. if docs.empty? raise "No embeddings found in database. Please index some documents first." end = docs.map { |d| d[:embedding] }.compact if .empty? raise "No valid embeddings found in database." end puts "Found #{.size} embeddings" # Validate embeddings = .map(&:size).uniq if .size > 1 puts " ⚠️ Warning: Inconsistent embedding dimensions found: #{.inspect}" puts " This may cause errors during UMAP training." # Filter to only embeddings with the most common dimension most_common_dim = .max_by { |dim| .count { |e| e.size == dim } } = .select { |e| e.size == most_common_dim } puts " Using only embeddings with #{most_common_dim} dimensions (#{.size} embeddings)" end # Check for nil or invalid values invalid_count = 0 nan_count = 0 inf_count = 0 = .select do || if !.is_a?(Array) invalid_count += 1 false elsif .any? { |v| !v.is_a?(Numeric) } invalid_count += 1 false elsif .any?(&:nan?) nan_count += 1 false elsif .any? { |v| !v.finite? } inf_count += 1 false else true end end if .size < .size puts "\n ⚠️ Data quality issues detected:" puts " • Invalid embeddings: #{invalid_count}" if invalid_count > 0 puts " • Embeddings with NaN: #{nan_count}" if nan_count > 0 puts " • Embeddings with Infinity: #{inf_count}" if inf_count > 0 puts " • Total removed: #{.size - .size}" puts " • Remaining valid: #{.size}" = end if .empty? raise "No valid embeddings found after validation.\n\n" \ "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \ "This suggests a problem with the embedding model or indexing process.\n\n" \ "Please try:\n" \ " 1. Re-indexing your documents: ragnar index <path> --force\n" \ " 2. Using a different embedding model\n" \ " 3. 
Checking your document content for unusual characters" end if .size < 10 raise "Too few valid embeddings (#{.size}) for UMAP training.\n\n" \ "UMAP requires at least 10 samples to work effectively.\n" \ "Please index more documents or check for data quality issues." end # Adjust parameters based on the number of samples # UMAP requires n_neighbors < n_samples # Also, n_components should be less than n_samples for stability n_samples = .size if n_neighbors >= n_samples n_neighbors = [3, (n_samples - 1) / 2].max.to_i puts " Adjusted n_neighbors to #{n_neighbors} (was #{15}, but only have #{n_samples} samples)" end if n_components >= n_samples n_components = [2, n_samples - 1].min puts " Adjusted n_components to #{n_components} (was #{50}, but only have #{n_samples} samples)" end # Warn if we have very few samples if n_samples < 100 puts "\n ⚠️ Warning: UMAP works best with at least 100 samples." puts " You currently have #{n_samples} samples." puts " Consider indexing more documents for better results." end # Convert to matrix format for ClusterKit # ClusterKit expects a 2D array or Numo::NArray = original_dims = .first.size # Ensure n_components is reasonable if n_components >= original_dims puts " ⚠️ Warning: n_components (#{n_components}) >= original dimensions (#{original_dims})" n_components = [original_dims / 2, 50].min puts " Reducing n_components to #{n_components}" end # For very high dimensional data, be more conservative if original_dims > 500 && n_components > 50 puts " ⚠️ Note: High dimensional data (#{original_dims}D) being reduced to #{n_components}D" puts " Consider using n_components <= 50 for stability" end puts "\nTraining UMAP model..." puts " Original dimensions: #{original_dims}" puts " Target dimensions: #{n_components}" puts " Neighbors: #{n_neighbors}" puts " Min distance: #{min_dist}" # Perform the actual training using the class-based API puts " Training UMAP model (this may take a moment)..." 
attempts = 0 max_attempts = 3 begin attempts += 1 @umap_instance = ClusterKit::Dimensionality::UMAP.new( n_components: n_components, n_neighbors: n_neighbors ) @reduced_embeddings = @umap_instance.fit_transform() puts " ✓ UMAP training complete" rescue Exception => e # Catch Exception (not just StandardError) because Rust panics from # ClusterKit raise fatal errors that bypass the default rescue if e..include?("LapackInvalidValue") || e..include?("SGESDD") || e..include?("illegal value") if attempts < max_attempts # LAPACK SVD can fail with certain dimension combinations — retry with fewer components n_components = [n_components / 2, 2].max n_neighbors = [n_neighbors, n_components - 1, 3].min puts " ⚠️ LAPACK error, retrying with n_components=#{n_components}, n_neighbors=#{n_neighbors} (attempt #{attempts + 1}/#{max_attempts})..." retry end raise RuntimeError, "\n❌ UMAP training failed due to a LAPACK numerical error.\n\n" \ "This can happen with certain data/dimension combinations.\n" \ "Try reducing n_components:\n" \ " ragnar umap train --n-components 10 --n-neighbors 5\n\n" \ "Current parameters:\n" \ " • n_components: #{n_components}\n" \ " • n_neighbors: #{n_neighbors}\n" \ " • embeddings: #{.size} samples\n" \ " • dimensions: #{original_dims}\n" elsif e..include?("index out of bounds") raise RuntimeError, "\n❌ UMAP training failed\n\n" \ "The UMAP algorithm encountered an index out of bounds error.\n\n" \ "This typically happens when:\n" \ " • The embedding data contains invalid values (NaN, Infinity)\n" \ " • The parameters are incompatible with your data\n" \ " • There are duplicate or corrupted embeddings\n\n" \ "Suggested solutions:\n" \ " 1. Try with more conservative parameters:\n" \ " ragnar umap train --n-components 10 --n-neighbors 5\n\n" \ " 2. Re-index your documents to regenerate embeddings:\n" \ " ragnar index <path> --force\n\n" \ " 3. 
Check your embedding model configuration\n\n" \ "Current parameters:\n" \ " • n_components: #{n_components}\n" \ " • n_neighbors: #{n_neighbors}\n" \ " • embeddings: #{.size} samples\n" \ " • dimensions: #{original_dims}\n" elsif e.is_a?(StandardError) || e..include?("unwrap") raise RuntimeError, "\n❌ UMAP training failed\n\n" \ "Error: #{e.}\n\n" \ "This may be due to incompatible parameters or data issues.\n" \ "Try using more conservative parameters:\n" \ " ragnar umap train --n-components 10 --n-neighbors 5\n" else # Re-raise non-application exceptions (Interrupt, SignalException, etc.) raise end end # Store the parameters for saving @model_params = { n_components: n_components, n_neighbors: n_neighbors, min_dist: min_dist } # Save the model save_model { embeddings_count: .size, original_dims: original_dims, reduced_dims: n_components } end |