14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# File 'lib/smart_csv_import/strategies/vector.rb', line 14
# Maps CSV headers onto the form's CSV fields: exact name matches first, then
# embedding similarity for whatever is left.
#
# Phase 1 normalizes each header and looks it up among the humanized field names
# (underscores replaced by spaces, compared case-insensitively). A hit is
# recorded with confidence 1.0.
#
# Phase 2 embeds the remaining headers, scores every header/field pair with
# CosineSimilarity, and accepts a pairing only when it is mutual: the field is
# the header's best-scoring field AND the header is that field's best-scoring
# header. Non-mutual candidates are logged and left unmatched.
#
# NOTE(review): the local names `header`, `header_embeddings`, `header_vectors`,
# `header_vec` and `best_header_for` were restored from context; confirm them
# against the canonical source.
#
# @param csv_headers [Enumerable] headers as they appear in the CSV; each one
#   becomes a key of the returned Hash when matched
# @param form_class [#csv_fields] object whose +csv_fields+ responds to
#   +empty?+ and +keys+ (presumably a Hash keyed by field name — confirm)
# @param sample_rows [Array] accepted for interface compatibility; not read here
# @return [Hash] header => MatchResult for every matched header; empty when the
#   form declares no CSV fields
# @return [StrategyFailure] when a RubyLLM::Error or Faraday::Error is raised
#   while matching
def match(csv_headers:, form_class:, sample_rows: [])
  field_definitions = form_class.csv_fields
  return {} if field_definitions.empty?

  field_names = field_definitions.keys
  humanized_names = field_names.map { |name| name.to_s.tr("_", " ") }
  humanized_index = humanized_names.each_with_index.to_h { |name, i| [name.downcase, field_names[i]] }

  results = {}
  needs_embedding = []

  # Phase 1: exact matches on the normalized header text need no embedding call.
  csv_headers.each do |header|
    normalized = HeaderNormalizer.normalize(header)
    if (field = humanized_index[normalized.downcase])
      log_info("Exact match: '#{header}' → :#{field} (normalized: '#{normalized}')")
      results[header] = MatchResult.matched(
        target_field: field,
        confidence: 1.0,
        strategy_name: "vector"
      )
    else
      needs_embedding << header
    end
  end
  return results if needs_embedding.empty?

  # Phase 2: embed what is left. Distinct normalized texts are embedded once and
  # then fanned back out to every original header that produced them.
  field_embeddings = fetch_field_embeddings(humanized_names, field_names)
  normalized_remaining = needs_embedding.map { |h| HeaderNormalizer.normalize(h) }
  # assumes compute_embeddings returns a lookup keyed by the normalized text — TODO confirm
  header_embeddings = compute_embeddings(normalized_remaining.uniq)
  header_vectors = needs_embedding.zip(normalized_remaining).to_h do |orig, norm|
    [orig, header_embeddings[norm]]
  end

  # header => { field_name => similarity }. Headers without a vector and fields
  # without an embedding are left out.
  score_matrix = needs_embedding.each_with_object({}) do |header, matrix|
    header_vec = header_vectors[header]
    next unless header_vec

    matrix[header] = field_names.each_with_object({}) do |field_name, scores|
      field_vec = field_embeddings[field_name]
      scores[field_name] = CosineSimilarity.call(header_vec, field_vec) if field_vec
    end
  end

  best_field_for = score_matrix.transform_values { |scores| scores.max_by { |_, s| s }&.first }
  # A header with no score for a field ranks as -1 for that field.
  best_header_for = field_names.each_with_object({}) do |field_name, bh|
    bh[field_name] = score_matrix.max_by { |_, scores| scores[field_name] || -1 }&.first
  end

  needs_embedding.each do |header|
    best_field = best_field_for[header]
    next unless best_field

    score = score_matrix[header][best_field]
    # Accept only mutual best matches, so a field is not claimed by a header
    # that it does not itself rank first.
    unless best_header_for[best_field] == header
      log_info("Non-mutual: '#{header}' → :#{best_field} (#{score.round(4)}) — field's best header is '#{best_header_for[best_field]}'")
      next
    end

    results[header] = MatchResult.matched(
      target_field: best_field,
      confidence: score.round(4),
      strategy_name: "vector"
    )
  end

  results
rescue RubyLLM::Error, Faraday::Error => e
  log_error("Vector strategy errored (#{e.class}): #{e.message}")
  StrategyFailure.new(strategy_name: "vector", error: e)
end
|