Module: Bulkrax::CsvParser::CsvValidationHelpers

Includes:
CsvValidationHierarchy
Defined in:
app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb

Overview

Private helper methods for CsvValidation.

Instance Method Summary collapse

Methods included from CsvValidationHierarchy

#build_child_to_parents_map, #build_item_hash, #categorise_validation_item, #collect_relationship_ids, #external_ids, #extract_validation_items, #parse_relationship_field, #resolvable_ids

Instance Method Details

#append_missing_model_notice!(notices, headers, csv_data) ⇒ Object

Adds a file-level notice when the model column is absent or every row has a blank model value, indicating that the default work type will be used for all rows. When this notice is present the per-row default_work_type_used warnings are suppressed in the formatter — no need to repeat the same message for every row.



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 136

# Pushes a single file-level "default work type" notice onto +notices+ when
# the CSV offers no usable model values.
#
# Nothing is pushed when Bulkrax.default_work_type is blank, or when a
# 'model' header exists and at least one row has a non-blank :model value.
#
# @param notices [Array<Hash>] collection the notice hash is appended to
# @param headers [Array] CSV headers; compared as strings against 'model'
# @param csv_data [Array<Hash>] parsed rows, read through their :model key
# @return [Array<Hash>, nil] +notices+ when a notice was added, otherwise nil
def append_missing_model_notice!(notices, headers, csv_data)
  fallback_type = Bulkrax.default_work_type
  return if fallback_type.blank?

  has_model_column = headers.map(&:to_s).include?('model')
  every_model_blank = has_model_column && csv_data.all? { |record| record[:model].blank? }

  # A model column with at least one populated row needs no notice.
  return if has_model_column && !every_model_blank

  # The column is either present-but-empty or absent altogether; the
  # translation key differs between those two situations.
  variant = every_model_blank ? 'column_empty' : 'column_missing'
  i18n_scope = 'bulkrax.importer.guided_import.validation.default_work_type_notice'
  notice = {
    field: 'model',
    default_work_type: fallback_type,
    message: I18n.t("#{i18n_scope}.message_#{variant}", default_work_type: fallback_type),
    suggestion: I18n.t("#{i18n_scope}.suggestion_#{variant}")
  }
  notices << notice
end

#append_missing_source_id!(missing_required, headers, source_id_key, all_models) ⇒ Object

Adds a missing source_identifier entry to missing_required when the column is absent and fill_in_blank_source_identifiers is not configured.



125
126
127
128
129
130
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 125

# Records the source identifier column as missing for every model when the
# CSV has no such header.
#
# Nothing is recorded when the header is present (compared as strings) or
# when Bulkrax.fill_in_blank_source_identifiers is present.
#
# @param missing_required [Array<Hash>] receives one { model:, field: } hash per model
# @param headers [Array] CSV headers
# @param source_id_key [String, Symbol] name of the source identifier column
# @param all_models [Array] models to record the missing field against
# @return [Array, nil] +all_models+ when entries were added, otherwise nil
def append_missing_source_id!(missing_required, headers, source_id_key, all_models)
  header_names = headers.map(&:to_s)
  return if header_names.include?(source_id_key.to_s)
  return if Bulkrax.fill_in_blank_source_identifiers.present?

  all_models.each do |model|
    missing_required << { model: model, field: source_id_key.to_s }
  end
end

#assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:, notices: []) ⇒ Object

Assembles the final result hash returned to the guided import UI.



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 156

# Builds the result hash for a validated CSV.
#
# Validity flags come from #determine_validity; the remaining values are
# copied from the arguments or read off +file_validator+. Both :rowCount and
# :totalItems report csv_data.length.
#
# @param headers [Array] CSV headers
# @param missing_required [Array<Hash>] missing required field entries
# @param header_issues [Hash] read through :unrecognized and :empty_columns
# @param row_errors [Array<Hash>] per-row findings
# @param csv_data [Array] parsed rows
# @param file_validator [#count_references, #missing_files, #found_files_count, #zip_included?]
# @param collections [Object] passed through unchanged
# @param works [Object] passed through unchanged
# @param file_sets [Object] passed through unchanged
# @param notices [Array<Hash>] file-level notices
# @return [Hash] camelCase-keyed result hash
def assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:, notices: []) # rubocop:disable Metrics/ParameterLists
  valid, warned = determine_validity(
    headers: headers,
    missing_required: missing_required,
    header_issues: header_issues,
    row_errors: row_errors,
    csv_data: csv_data,
    file_validator: file_validator,
    notices: notices
  )
  row_total = csv_data.length

  summary = {
    headers: headers,
    missingRequired: missing_required,
    notices: notices,
    unrecognized: header_issues[:unrecognized],
    emptyColumns: header_issues[:empty_columns],
    rowCount: row_total,
    isValid: valid,
    hasWarnings: warned,
    rowErrors: row_errors,
    collections: collections,
    works: works,
    fileSets: file_sets,
    totalItems: row_total
  }

  # File-related figures are appended last so the key order is unchanged.
  summary.merge(
    fileReferences: file_validator.count_references,
    missingFiles: file_validator.missing_files,
    foundFiles: file_validator.found_files_count,
    zipIncluded: file_validator.zip_included?
  )
end

#build_find_record ⇒ Object

Builds the find_record lambda used by row validators and hierarchy extraction.



205
206
207
208
209
210
211
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 205

# Returns a one-argument lambda that delegates to
# #find_record_by_source_identifier with the identifier property name and
# search field resolved from the 'Bulkrax::CsvParser' field mappings.
#
# The property name is the key of the first mapping whose
# 'source_identifier' value is exactly true ('source' when none is flagged).
# The search field is the first 'search_field' of that mapping, falling back
# to "<property>_sim".
#
# @return [Proc] lambda taking an identifier
def build_find_record
  parser_mappings = Bulkrax.field_mappings['Bulkrax::CsvParser'] || {}
  flagged_pair = parser_mappings.find { |_name, config| config['source_identifier'] == true }
  identifier_field = flagged_pair&.first || 'source'

  configured_search = Array.wrap(parser_mappings.dig(identifier_field, 'search_field')).first&.to_s
  search_field = configured_search || "#{identifier_field}_sim"

  lambda { |id| find_record_by_source_identifier(id, identifier_field, search_field) }
end

#build_parent_edges(csv_data, suffix_pattern, split_pattern) ⇒ Object



273
274
275
276
277
278
279
280
281
282
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 273

# Builds a { source_identifier => [parent ids] } hash from the records.
#
# For each record with a non-blank :source_identifier, the parent ids are
# the values of record[:parent] (via #split_or_single) followed by the
# values of the record[:raw_row] columns matching +suffix_pattern+ (via
# #suffixed_values), with duplicates removed. A later record with the same
# identifier replaces the earlier entry.
#
# @param csv_data [Array<Hash>] records with :source_identifier, :parent, :raw_row
# @param suffix_pattern [Regexp] passed to #suffixed_values
# @param split_pattern [Object] passed to #split_or_single
# @return [Hash{Object => Array}]
def build_parent_edges(csv_data, suffix_pattern, split_pattern)
  edges = {}
  csv_data.each do |row|
    node = row[:source_identifier]
    next if node.blank?

    declared = split_or_single(row[:parent], split_pattern)
    numbered = suffixed_values(row[:raw_row], suffix_pattern)
    # Array union keeps first-seen order and drops duplicates.
    edges[node] = declared | numbered
  end
  edges
end

#build_relationship_graph(csv_data, mappings) ⇒ Object

Builds a graph of { source_identifier => [parent_ids] } from all CSV records. Used by CircularReference validator to detect cycles across the whole CSV.

Parent edges are collected from both directions:

- explicit parent declarations (parents / parents_N columns)
- inverted child declarations (children / children_N columns), mirroring
  the normalisation done in importers_stepper.js#normalizeRelationships


262
263
264
265
266
267
268
269
270
271
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 262

# Builds a { source_identifier => [parent ids] } graph for the whole CSV.
#
# Edges come from #build_parent_edges and are then extended in place by
# #invert_child_edges. Numbered columns are matched with
# /\A<column>_\d+\z/, where <column> is the name resolved by
# #resolve_relationship_column (defaults 'parents' and 'children').
#
# @param csv_data [Array<Hash>] parsed records
# @param mappings [Hash] field mappings
# @return [Hash] the combined graph
def build_relationship_graph(csv_data, mappings)
  numbered_column = ->(name) { /\A#{Regexp.escape(name)}_\d+\z/ }

  parents_name  = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
  children_name = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
  parents_numbered  = numbered_column.call(parents_name)
  children_numbered = numbered_column.call(children_name)

  build_parent_edges(csv_data, parents_numbered, resolve_parent_split_pattern(mappings)).tap do |edges|
    invert_child_edges(edges, csv_data, children_numbered, resolve_children_split_pattern(mappings))
  end
end

#build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 48

# Returns the list of header names the validator accepts.
#
# The list is the columns produced by CsvTemplate::ColumnBuilder plus every
# non-property alias from #non_property_mapping_aliases, de-duplicated, minus
# CsvTemplate::CsvBuilder::IGNORED_PROPERTIES.
#
# If anything in that path raises a StandardError, the error is logged and a
# fallback list is returned: five standard column names plus the mapped
# column of every property listed in +field_metadata+.
#
# @param mapping_manager [#key_to_mapped_column] used by the fallback path
# @param field_analyzer [Object] forwarded to the ValidationContext
# @param all_models [Array] forwarded to the ValidationContext
# @param mappings [Hash] field mappings
# @param field_metadata [Hash] per-model metadata; each value is read through :properties
# @return [Array] accepted header names
def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
  svc = Bulkrax::CsvParser::ValidationContext.new(
    mapping_manager: mapping_manager,
    field_analyzer: field_analyzer,
    all_models: all_models,
    mappings: mappings
  )
  all_cols = CsvTemplate::ColumnBuilder.new(svc).all_columns
  # ColumnBuilder only emits the first `from:` alias per non-property key
  # (core/file/relationship). Accept every alias so a CSV using a
  # non-primary alias like `file` (when mappings are `from: ['item', 'file']`)
  # isn't flagged unrecognised. Property-level aliases are handled
  # separately by find_unrecognized_validation_headers via mapped_to_key.
  non_property_aliases = non_property_mapping_aliases(mappings)
  (all_cols + non_property_aliases).uniq - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}")
  # Best-effort fallback built only from the arguments.
  standard = %w[model source_identifier parents children file]
  model_fields = field_metadata.values.flat_map { |m| m[:properties] }
                               .map { |prop| mapping_manager.key_to_mapped_column(prop) }
  (standard + model_fields).uniq
end

#build_validation_field_metadata(all_models, field_analyzer) ⇒ Object



37
38
39
40
41
42
43
44
45
46
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 37

# Collects per-model field metadata from the field analyzer.
#
# For each model, asks +field_analyzer+ for its field list and copies the
# 'properties', 'required_terms' and 'controlled_vocab_terms' entries found
# under the model's key, defaulting each to an empty array.
#
# @param all_models [Array] models to describe
# @param field_analyzer [#find_or_create_field_list_for] called with model_name:
# @return [Hash] { model => { properties:, required_terms:, controlled_vocab_terms: } }
def build_validation_field_metadata(all_models, field_analyzer)
  all_models.each_with_object({}) do |model, hash|
    field_list = field_analyzer.find_or_create_field_list_for(model_name: model)
    hash[model] = {
      properties: field_list.dig(model, 'properties') || [],
      required_terms: field_list.dig(model, 'required_terms') || [],
      controlled_vocab_terms: field_list.dig(model, 'controlled_vocab_terms') || []
    }
  end
end

#determine_validity(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, notices:) ⇒ Object

Returns [is_valid, has_warnings] for the assembled result. rights_statement can be supplied on Step 2, so a CSV missing ONLY the rights_statement column is valid-with-warnings rather than a blocker; the display formatter styles that case as a warning accordion.



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 187

# Computes the [is_valid, has_warnings] pair for a validation run.
#
# Errors (any one makes the result invalid): a missing required field other
# than the rights_statement-only case, blank headers, no rows, missing
# files, or a row entry with severity 'error'.
#
# Warnings: unrecognized headers, empty columns, possible missing files, a
# row entry with severity 'warning', any notice, or missing_required
# consisting solely of 'rights_statement' entries.
#
# @param headers [Array] CSV headers
# @param missing_required [Array<Hash>] entries read through :field
# @param header_issues [Hash] read through :unrecognized and :empty_columns
# @param row_errors [Array<Hash>] entries read through :severity
# @param csv_data [Array] parsed rows
# @param file_validator [#missing_files, #possible_missing_files?]
# @param notices [Array] file-level notices
# @return [Array] two-element array: validity flag, then warning flag
def determine_validity(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, notices:) # rubocop:disable Metrics/ParameterLists
  by_severity  = row_errors.group_by { |entry| entry[:severity] }
  error_rows   = by_severity.fetch('error', [])
  warning_rows = by_severity.fetch('warning', [])

  # A CSV missing nothing but rights_statement is downgraded to a warning.
  rights_only = missing_required.present? &&
                missing_required.all? { |entry| entry[:field].to_s == 'rights_statement' }
  blocked_by_missing = missing_required.any? && !rights_only

  invalid = blocked_by_missing || headers.blank? || csv_data.empty? ||
            file_validator.missing_files.any? || error_rows.any?
  warned = header_issues[:unrecognized].any? || header_issues[:empty_columns].any? ||
           file_validator.possible_missing_files? || warning_rows.any? ||
           notices.any? || rights_only

  [!invalid, warned]
end

#find_empty_column_positions(headers, raw_csv) ⇒ Object



115
116
117
118
119
120
121
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 115

# Returns the 1-based positions of columns whose header is blank but which
# contain a non-blank value in at least one row.
#
# @param headers [Array] CSV headers, in column order
# @param raw_csv [Enumerable] rows responding to #fields (values by column index)
# @return [Array<Integer>] 1-based column positions
def find_empty_column_positions(headers, raw_csv)
  positions = []
  headers.each_with_index do |header, index|
    next if header.present?
    next unless raw_csv.any? { |row| row.fields[index].present? }

    positions << index + 1
  end
  positions
end

#find_missing_required_headers(headers, field_metadata, mapping_manager) ⇒ Object



85
86
87
88
89
90
91
92
93
94
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 85

# Lists the required terms, per model, that no CSV header maps to.
#
# Each header is translated with mapping_manager.mapped_to_key and has any
# trailing "_<digits>" suffix removed before comparison with the model's
# :required_terms.
#
# @param headers [Array<String>] CSV headers
# @param field_metadata [Hash] { model => { required_terms: [...] } }
# @param mapping_manager [#mapped_to_key] translates a header to its mapping key
# @return [Array<Hash>] unique { model:, field: } entries
def find_missing_required_headers(headers, field_metadata, mapping_manager)
  csv_keys = headers.map { |h| mapping_manager.mapped_to_key(h).sub(/_\d+\z/, '') }.uniq
  missing = []
  field_metadata.each do |model, meta|
    (meta[:required_terms] || []).each do |field|
      missing << { model: model, field: field } unless csv_keys.include?(field)
    end
  end
  missing.uniq
end

#find_record_by_source_identifier(identifier, work_identifier, work_identifier_search) ⇒ Boolean

Attempt to locate an existing repository record by its identifier. The identifier may be a repository object ID or a source_identifier property value. Checks the repository directly (by ID, then by Solr property search) — a Bulkrax Entry record alone is not sufficient, as the object may never have been created.

Parameters:

  • identifier (String)
  • work_identifier (String)

    the source_identifier property name (e.g. “source”)

  • work_identifier_search (String)

    the Solr field for source_identifier (e.g. “source_sim”)

Returns:

  • (Boolean)

    true if a matching repository object is found



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 222

# Reports whether +identifier+ resolves to something through
# Bulkrax.object_factory.
#
# Tries object_factory.find_or_nil first; failing that, runs
# object_factory.search_by_property once per class in
# [Bulkrax.collection_model_class, *Bulkrax.curation_concerns] until one
# returns a present result. Any StandardError raised along the way is
# swallowed and reported as "not found".
#
# @param identifier [String] value to look up
# @param work_identifier [String] passed as name_field:
# @param work_identifier_search [String] passed as search_field:
# @return [Boolean]
def find_record_by_source_identifier(identifier, work_identifier, work_identifier_search)
  return false if identifier.blank?

  direct_hit = Bulkrax.object_factory.find_or_nil(identifier)
  return true if direct_hit.present?

  candidate_classes = [Bulkrax.collection_model_class, *Bulkrax.curation_concerns]
  candidate_classes.any? do |model_class|
    matches = Bulkrax.object_factory.search_by_property(
      value: identifier,
      klass: model_class,
      search_field: work_identifier_search,
      name_field: work_identifier
    )
    matches.present?
  end
rescue StandardError
  false
end

#find_unrecognized_validation_headers(headers, valid_headers, mapping_manager: nil, field_metadata: nil) ⇒ Object

A header is considered recognised if it appears in valid_headers or if it matches any alias in a known property's `from` array. The real importer (CsvParser#missing_elements) scans every `from` entry when matching incoming columns, so the validator has to use the same rule — otherwise a CSV that imports cleanly gets flagged for columns like `creator` when the mapping declares `creator: { from: ['author', 'creator'] }`.



102
103
104
105
106
107
108
109
110
111
112
113
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 102

# Maps every unrecognised header to a spelling suggestion.
#
# A header is treated as recognised, and left out of the result, when it is
# blank, when it or its base name (trailing "_<digits>" removed) appears in
# +valid_headers+, or when mapping_manager.mapped_to_key(base) is one of the
# properties listed in +field_metadata+.
#
# @param headers [Array<String, nil>] CSV headers
# @param valid_headers [Array<String>] accepted header names; also the suggestion dictionary
# @param mapping_manager [#mapped_to_key, nil] optional alias resolver
# @param field_metadata [Hash, nil] { model => { properties: [...] } }
# @return [Hash] { header => first DidYouMean suggestion or nil }
def find_unrecognized_validation_headers(headers, valid_headers, mapping_manager: nil, field_metadata: nil)
  known_property_keys = (field_metadata || {}).values.flat_map { |m| Array(m[:properties]) }.to_set
  checker = DidYouMean::SpellChecker.new(dictionary: valid_headers)
  unrecognized = headers.reject do |h|
    next true if h.blank?
    base = h.sub(/_\d+\z/, '')
    next true if valid_headers.include?(h) || valid_headers.include?(base)
    # Property-level aliases: accept a header whose mapping key is a known property.
    mapped_key = mapping_manager&.mapped_to_key(base)
    mapped_key && known_property_keys.include?(mapped_key)
  end
  unrecognized.index_with { |h| checker.correct(h).first }
end

#invert_child_edges(graph, csv_data, suffix_pattern, split_pattern) ⇒ Object



284
285
286
287
288
289
290
291
292
293
294
295
296
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 284

# Adds parent edges to +graph+ that are implied by child declarations.
#
# For each record with a non-blank :source_identifier, every child id (from
# record[:children] via #split_or_single, plus record[:raw_row] columns
# matching +suffix_pattern+ via #suffixed_values) gets that identifier
# appended to graph[child_id], unless it is already listed. +graph+ is
# modified in place.
#
# @param graph [Hash{Object => Array}] child id => parent ids; mutated
# @param csv_data [Array<Hash>] records with :source_identifier, :children, :raw_row
# @param suffix_pattern [Regexp] passed to #suffixed_values
# @param split_pattern [Object] passed to #split_or_single
# @return [Array<Hash>] +csv_data+
def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern)
  csv_data.each do |row|
    parent_id = row[:source_identifier]
    next if parent_id.blank?

    declared = split_or_single(row[:children], split_pattern)
    numbered = suffixed_values(row[:raw_row], suffix_pattern)

    (declared + numbered).each do |child|
      parents = (graph[child] ||= [])
      parents << parent_id unless parents.include?(parent_id)
    end
  end
end

#non_property_mapping_aliases(mappings) ⇒ Object

Returns every `from:` alias for mapping keys that describe non-property columns (core/file/relationship). These keys are fixed by the descriptor rather than discovered per-model, so every alias is unambiguously valid.



74
75
76
77
78
79
80
81
82
83
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 74

# Collects the "from" aliases of every non-property mapping key.
#
# The keys are the descriptor's core columns followed by the keys of the
# :files and :relationships entries in ColumnDescriptor::COLUMN_DESCRIPTIONS.
# A key contributes aliases only when its mapping entry is a Hash.
#
# @param mappings [Hash] field mappings, indexed by mapping key
# @return [Array] aliases, in key order
def non_property_mapping_aliases(mappings)
  fixed_keys = CsvTemplate::ColumnDescriptor.new.core_columns
  %i[files relationships].each do |group|
    fixed_keys += CsvTemplate::ColumnDescriptor::COLUMN_DESCRIPTIONS[group].flat_map(&:keys)
  end

  fixed_keys.flat_map do |mapping_key|
    config = mappings[mapping_key]
    next [] unless config.is_a?(Hash)

    Array(config["from"])
  end
end

#parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key) ⇒ Object

Parse rows from a CsvEntry.read_data result into the canonical record shape. CsvEntry.read_data returns CSV::Row objects with symbol headers; blank rows are already filtered by CsvWrapper.



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 19

# Converts each raw row into the record hash used by the validators.
#
# Each record carries :source_identifier, :model, :parent, :children and
# :file (looked up on the row with the given keys; :model always via
# :model) plus :raw_row, the row's hash form with stringified keys.
#
# Any StandardError raised while mapping is logged and an empty array is
# returned instead.
#
# @param raw_csv [Enumerable] rows responding to #to_h and #[]
# @param source_id_key [Object] row key for the source identifier
# @param parent_key [Object] row key for the parent column
# @param children_key [Object] row key for the children column
# @param file_key [Object] row key for the file column
# @return [Array<Hash>] parsed records, or [] on error
def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
  column_for = {
    source_identifier: source_id_key,
    model: :model,
    parent: parent_key,
    children: children_key,
    file: file_key
  }

  raw_csv.map do |row|
    # transform_keys turns the row's header keys into strings
    string_keyed = row.to_h.transform_keys(&:to_s)
    record = column_for.transform_values { |column| row[column] }
    record[:raw_row] = string_keyed
    record
  end
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}")
  []
end

#resolve_children_split_pattern(mappings) ⇒ Object



251
252
253
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 251

# Returns the children split setting, run through
# Bulkrax::SplitPatternCoercion.coerce.
#
# The setting is read from mappings['children']['split'], falling back to
# mappings[:children][:split] when the string-keyed lookup is nil or false.
#
# @param mappings [Hash] field mappings
# @return [Object] whatever SplitPatternCoercion.coerce returns
def resolve_children_split_pattern(mappings)
  raw_split = mappings.dig('children', 'split')
  raw_split ||= mappings.dig(:children, :split)
  Bulkrax::SplitPatternCoercion.coerce(raw_split)
end

#resolve_parent_split_pattern(mappings) ⇒ Object



247
248
249
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 247

# Returns the parents split setting, run through
# Bulkrax::SplitPatternCoercion.coerce.
#
# The setting is read from mappings['parents']['split'], falling back to
# mappings[:parents][:split] when the string-keyed lookup is nil or false.
#
# @param mappings [Hash] field mappings
# @return [Object] whatever SplitPatternCoercion.coerce returns
def resolve_parent_split_pattern(mappings)
  raw_split = mappings.dig('parents', 'split')
  raw_split ||= mappings.dig(:parents, :split)
  Bulkrax::SplitPatternCoercion.coerce(raw_split)
end

#resolve_relationship_column(mappings, flag, default) ⇒ Object

Returns the raw CSV column name (String) for a relationship field. Looks for the mapping entry flagged with flag and returns its first from value, falling back to default when none is found.



242
243
244
245
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 242

# Returns the CSV column name for a relationship field.
#
# Finds the first mapping whose value is a Hash with a truthy +flag+ entry
# and returns the first of its 'from' values; returns +default+ when no
# mapping is flagged or the flagged mapping has no 'from' value.
#
# 'from' is wrapped with Array() so a scalar value is returned whole.
# Calling #first directly on a String raises in plain Ruby and returns only
# the first character where ActiveSupport's String#first is loaded.
#
# @param mappings [Hash] field mappings
# @param flag [String] mapping flag to look for
# @param default [String] fallback column name
# @return [Object] the resolved column name, or +default+
def resolve_relationship_column(mappings, flag, default)
  _key, config = mappings.find { |_name, value| value.is_a?(Hash) && value[flag] }
  return default unless config

  Array(config['from']).first || default
end

#resolve_validation_key(mapping_manager, key: nil, flag: nil, default:) ⇒ Object

Resolve a symbol key from mappings for use as a record hash key. Returns a Symbol matching the parser’s symbol-keyed record hash.



11
12
13
14
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 11

# Resolves a column name through the mapping manager and returns it as a
# Symbol.
#
# Asks mapping_manager.resolve_column_name (with +default+ stringified) and
# symbolises the first entry of the result; returns +default+ unchanged when
# that first entry is nil.
#
# @param mapping_manager [#resolve_column_name]
# @param key [Object, nil] forwarded as key:
# @param flag [Object, nil] forwarded as flag:
# @param default [Symbol] fallback value
# @return [Symbol] the resolved key, or +default+
def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:)
  candidates = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s)
  primary = candidates.first
  primary.nil? ? default : primary.to_sym
end

#split_or_single(value, split_pattern) ⇒ Object



298
299
300
301
302
303
304
305
306
307
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 298

# Turns a cell value into an array of stripped, non-blank strings.
#
# When Bulkrax::SplitPatternCoercion.coerce(split_pattern) is truthy, the
# stringified value is split on it. Otherwise a present value yields a
# one-element array and anything else yields an empty array.
#
# @param value [Object] raw cell value
# @param split_pattern [Object] pattern handed to SplitPatternCoercion.coerce
# @return [Array<String>]
def split_or_single(value, split_pattern)
  delimiter = Bulkrax::SplitPatternCoercion.coerce(split_pattern)
  return value.to_s.split(delimiter).map(&:strip).reject(&:blank?) if delimiter
  return [] unless value.present?

  [value.to_s.strip]
end

#suffixed_values(raw_row, suffix_pattern) ⇒ Object



309
310
311
312
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 309

# Returns the stripped, non-blank string values of every +raw_row+ entry
# whose key (as a string) matches +suffix_pattern+, in row order.
#
# @param raw_row [Hash] column name => cell value
# @param suffix_pattern [Regexp] pattern the column name must match
# @return [Array<String>]
def suffixed_values(raw_row, suffix_pattern)
  raw_row.each_with_object([]) do |(column, cell), collected|
    next unless column.to_s.match?(suffix_pattern)

    text = cell.to_s.strip
    collected << text unless text.blank?
  end
end