Module: Bulkrax::CsvParser::CsvValidationHelpers

Includes:
CsvValidationHierarchy
Defined in:
app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb

Overview

Private helper methods for CsvValidation.

Instance Method Summary collapse

Methods included from CsvValidationHierarchy

#build_child_to_parents_map, #build_item_hash, #categorise_validation_item, #collect_relationship_ids, #external_ids, #extract_validation_items, #parse_relationship_field, #resolvable_ids

Instance Method Details

#append_missing_model_notice!(notices, headers, csv_data) ⇒ Object

Adds a file-level notice when the model column is absent or every row has a blank model value, indicating that the default work type will be used for all rows. When this notice is present the per-row default_work_type_used warnings are suppressed in the formatter — no need to repeat the same message for every row.



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 104

# Appends one file-level notice to +notices+ when the CSV has no 'model'
# column, or has one whose value is blank on every row. Does nothing when
# Bulkrax.default_work_type is blank or when at least one row carries a model.
#
# @param notices [Array<Hash>] collection the notice hash is appended to
# @param headers [Array] CSV headers; each is compared as a string
# @param csv_data [Array<Hash>] parsed rows, read through row[:model]
# @return [Array<Hash>, nil] +notices+ when a notice was appended, otherwise nil
def append_missing_model_notice!(notices, headers, csv_data)
  fallback_type = Bulkrax.default_work_type
  return if fallback_type.blank?

  has_model_column = headers.any? { |header| header.to_s == 'model' }
  every_model_blank = has_model_column && csv_data.all? { |row| row[:model].blank? }
  # A model column with at least one populated row needs no notice.
  return if has_model_column && !every_model_blank

  # Past the guard the column is either absent or entirely blank.
  variant = every_model_blank ? 'column_empty' : 'column_missing'
  i18n_scope = 'bulkrax.importer.guided_import.validation.default_work_type_notice'
  notice = {
    field: 'model',
    default_work_type: fallback_type,
    message: I18n.t("#{i18n_scope}.message_#{variant}", default_work_type: fallback_type),
    suggestion: I18n.t("#{i18n_scope}.suggestion_#{variant}")
  }
  notices << notice
end

#append_missing_source_id!(missing_required, headers, source_id_key, all_models) ⇒ Object

Adds a missing source_identifier entry to missing_required when the column is absent and fill_in_blank_source_identifiers is not configured.



93
94
95
96
97
98
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 93

# Appends one { model:, field: } entry per model to +missing_required+ when
# no header matches +source_id_key+ (compared as strings) and
# Bulkrax.fill_in_blank_source_identifiers is not present.
#
# @param missing_required [Array<Hash>] collection the entries are appended to
# @param headers [Array] CSV headers
# @param source_id_key [String, Symbol] name of the source identifier column
# @param all_models [Array] models to record the missing field against
# @return [Array, nil] +all_models+ when entries were appended, otherwise nil
def append_missing_source_id!(missing_required, headers, source_id_key, all_models)
  column_present = headers.any? { |header| header.to_s == source_id_key.to_s }
  return if column_present
  # When identifiers are filled in automatically the column is optional.
  return if Bulkrax.fill_in_blank_source_identifiers.present?

  all_models.each do |model|
    missing_required << { model: model, field: source_id_key.to_s }
  end
end

#apply_rights_statement_validation_override!(result, missing_required) ⇒ Object



123
124
125
126
127
128
129
130
131
132
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 123

# Marks an invalid +result+ as valid-with-warnings when the only missing
# required field, across all entries, is 'rights_statement'. The override is
# skipped when the result is already valid, has blank headers, or lists any
# missing files.
#
# @param result [Hash] result hash; :isValid and :hasWarnings may be overwritten
# @param missing_required [Array<Hash>] entries read through entry[:field]
# @return [true, nil] true when the override was applied, otherwise nil
def apply_rights_statement_validation_override!(result, missing_required)
  return unless missing_required.present?
  return unless missing_required.all? { |entry| entry[:field].to_s == 'rights_statement' }
  return if result[:isValid]
  return if result[:headers].blank?
  return if result[:missingFiles]&.any?

  result[:isValid] = true
  result[:hasWarnings] = true
end

#assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:, notices: []) ⇒ Object

Assembles the final result hash returned to the guided import UI.



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 135

# Builds the result hash for a validation run.
#
# :isValid is false when any required field is missing, the headers are
# blank, there are no rows, the file validator reports missing files, or any
# row error has severity 'error'. :hasWarnings is set by unrecognized headers,
# empty columns, possibly-missing files, row entries with severity 'warning',
# or any notice.
#
# @param headers [Array] CSV headers
# @param missing_required [Array<Hash>] missing required field entries
# @param header_issues [Hash] read through :unrecognized and :empty_columns
# @param row_errors [Array<Hash>] row-level entries, each with a :severity
# @param csv_data [Array] parsed rows
# @param file_validator [Object] responds to missing_files,
#   possible_missing_files?, count_references, found_files_count, zip_included?
# @param collections [Object] passed through as :collections
# @param works [Object] passed through as :works
# @param file_sets [Object] passed through as :fileSets
# @param notices [Array] file-level notices
# @return [Hash] camelCase-keyed result hash
def assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:, notices: []) # rubocop:disable Metrics/ParameterLists
  severities = row_errors.group_by { |entry| entry[:severity] }
  blocking = missing_required.any? || headers.blank? || csv_data.empty? ||
             file_validator.missing_files.any? || severities.key?('error')
  advisory = header_issues[:unrecognized].any? || header_issues[:empty_columns].any? ||
             file_validator.possible_missing_files? || severities.key?('warning') || notices.any?
  row_total = csv_data.length

  {
    headers: headers,
    missingRequired: missing_required,
    notices: notices,
    unrecognized: header_issues[:unrecognized],
    emptyColumns: header_issues[:empty_columns],
    rowCount: row_total,
    isValid: !blocking,
    hasWarnings: advisory,
    rowErrors: row_errors,
    collections: collections,
    works: works,
    fileSets: file_sets,
    # Same count as rowCount.
    totalItems: row_total,
    fileReferences: file_validator.count_references,
    missingFiles: file_validator.missing_files,
    foundFiles: file_validator.found_files_count,
    zipIncluded: file_validator.zip_included?
  }
end

#build_find_record ⇒ Object

Builds the find_record lambda used by row validators and hierarchy extraction.



165
166
167
168
169
170
171
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 165

# Builds a one-argument lambda that looks an identifier up through
# find_record_by_source_identifier.
#
# The identifier property is the key of the first 'Bulkrax::CsvParser' field
# mapping whose 'source_identifier' value is exactly true, defaulting to
# 'source'. The search field is the first 'search_field' value of that
# mapping, defaulting to "<identifier>_sim".
#
# @return [Proc] lambda taking an id and returning the lookup result
def build_find_record
  parser_mappings = Bulkrax.field_mappings['Bulkrax::CsvParser'] || {}
  flagged_entry = parser_mappings.find { |_name, config| config['source_identifier'] == true }
  work_identifier = flagged_entry&.first || 'source'

  configured_search = Array.wrap(parser_mappings.dig(work_identifier, 'search_field')).first
  work_identifier_search = configured_search&.to_s || "#{work_identifier}_sim"

  lambda do |id|
    find_record_by_source_identifier(id, work_identifier, work_identifier_search)
  end
end

#build_parent_edges(csv_data, suffix_pattern, split_pattern) ⇒ Object



241
242
243
244
245
246
247
248
249
250
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 241

# Builds { source_identifier => [parent ids] } from the parent declarations
# of each record. Parent ids come from record[:parent] (split through
# split_or_single) followed by the values of raw_row columns matching
# +suffix_pattern+; duplicates are removed, keeping first occurrences.
# Records with a blank :source_identifier are skipped.
#
# @param csv_data [Array<Hash>] records with :source_identifier, :parent, :raw_row
# @param suffix_pattern [Regexp] matches numbered parent column names
# @param split_pattern [String, Regexp, nil] delimiter for multi-valued cells
# @return [Hash{Object => Array<String>}]
def build_parent_edges(csv_data, suffix_pattern, split_pattern)
  graph = {}
  csv_data.each do |row|
    source_id = row[:source_identifier]
    next if source_id.blank?

    declared = split_or_single(row[:parent], split_pattern)
    numbered = suffixed_values(row[:raw_row], suffix_pattern)
    # Array#| is an order-preserving union, so repeated ids collapse.
    graph[source_id] = declared | numbered
  end
  graph
end

#build_relationship_graph(csv_data, mappings) ⇒ Object

Builds a graph of { source_identifier => [parent_ids] } from all CSV records. Used by CircularReference validator to detect cycles across the whole CSV.

Parent edges are collected from both directions:

- explicit parent declarations (parents / parents_N columns)
- inverted child declarations (children / children_N columns), mirroring
  the normalisation done in importers_stepper.js#normalizeRelationships


230
231
232
233
234
235
236
237
238
239
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 230

# Builds { source_identifier => [parent ids] } for every CSV record by
# combining explicit parent declarations with inverted child declarations.
#
# The parent and children column names are resolved from +mappings+
# (defaulting to 'parents' and 'children'); numbered variants such as
# "<column>_1" are matched through an anchored pattern built from each name.
#
# @param csv_data [Array<Hash>] parsed records
# @param mappings [Hash] field mappings
# @return [Hash] the relationship graph
def build_relationship_graph(csv_data, mappings)
  numbered_variant = ->(column) { /\A#{Regexp.escape(column)}_\d+\z/ }
  parents_column = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
  kids_column = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
  parents_pattern = numbered_variant.call(parents_column)
  kids_pattern = numbered_variant.call(kids_column)

  build_parent_edges(csv_data, parents_pattern, resolve_parent_split_pattern(mappings)).tap do |graph|
    # Child declarations are added as edges pointing from child to parent.
    invert_child_edges(graph, csv_data, kids_pattern, resolve_children_split_pattern(mappings))
  end
end

#build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 48

# Returns the column names treated as valid CSV headers: every column from
# CsvTemplate::ColumnBuilder minus CsvTemplate::CsvBuilder::IGNORED_PROPERTIES.
#
# If building that list raises a StandardError, the error is logged and a
# fallback list is returned instead: a fixed set of standard columns plus the
# mapped column name of every property listed in +field_metadata+.
#
# @param mapping_manager [Object] responds to key_to_mapped_column(prop)
# @param field_analyzer [Object] passed to ValidationContext
# @param all_models [Array] passed to ValidationContext
# @param mappings [Hash] passed to ValidationContext
# @param field_metadata [Hash] { model => { properties: [...] } }; only used
#   by the fallback path
# @return [Array] valid header names
def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
  svc = ValidationContext.new(
    mapping_manager: mapping_manager,
    field_analyzer: field_analyzer,
    all_models: all_models,
    mappings: mappings
  )
  all_cols = CsvTemplate::ColumnBuilder.new(svc).all_columns
  all_cols - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}")
  standard = %w[model source_identifier parents children file]
  model_fields = field_metadata.values.flat_map { |m| m[:properties] }
                               .map { |prop| mapping_manager.key_to_mapped_column(prop) }
  (standard + model_fields).uniq
end

#build_validation_field_metadata(all_models, field_analyzer) ⇒ Object



37
38
39
40
41
42
43
44
45
46
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 37

# Collects, for each model, the property names, required terms and
# controlled-vocabulary terms reported by +field_analyzer+.
#
# Each list is read from the analyzer's result under the model key and the
# string keys 'properties', 'required_terms' and 'controlled_vocab_terms';
# a missing list becomes an empty array.
#
# @param all_models [Array] model names to analyze
# @param field_analyzer [Object] responds to
#   find_or_create_field_list_for(model_name:)
# @return [Hash] { model => { properties:, required_terms:, controlled_vocab_terms: } }
def build_validation_field_metadata(all_models, field_analyzer)
  all_models.each_with_object({}) do |model, hash|
    field_list = field_analyzer.find_or_create_field_list_for(model_name: model)
    hash[model] = {
      properties: field_list.dig(model, 'properties') || [],
      required_terms: field_list.dig(model, 'required_terms') || [],
      controlled_vocab_terms: field_list.dig(model, 'controlled_vocab_terms') || []
    }
  end
end

#find_empty_column_positions(headers, raw_csv) ⇒ Object



83
84
85
86
87
88
89
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 83

# Returns the 1-based positions of columns whose header is blank but which
# hold a value in at least one row.
#
# @param headers [Array] CSV headers in column order
# @param raw_csv [Enumerable] rows responding to #fields (values in column order)
# @return [Array<Integer>] 1-based column positions
def find_empty_column_positions(headers, raw_csv)
  positions = []
  headers.each_with_index do |header, index|
    next if header.present?
    # A blank header only matters when the column actually carries data.
    next unless raw_csv.any? { |row| row.fields[index].present? }

    positions << (index + 1)
  end
  positions
end

#find_missing_required_headers(headers, field_metadata, mapping_manager) ⇒ Object



65
66
67
68
69
70
71
72
73
74
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 65

# Lists the required terms of each model that no CSV header provides.
#
# Each header is translated through mapping_manager.mapped_to_key and has a
# trailing numeric suffix ("_1", "_2", …) removed before comparison, so a
# numbered column satisfies the requirement for its base field.
#
# @param headers [Array] CSV headers
# @param field_metadata [Hash] { model => { required_terms: [...] } }; a
#   missing :required_terms is treated as empty
# @param mapping_manager [Object] responds to mapped_to_key(header)
# @return [Array<Hash>] unique { model:, field: } entries
def find_missing_required_headers(headers, field_metadata, mapping_manager)
  csv_keys = headers.map { |h| mapping_manager.mapped_to_key(h).sub(/_\d+\z/, '') }.uniq
  missing = []
  field_metadata.each do |model, meta|
    (meta[:required_terms] || []).each do |field|
      missing << { model: model, field: field } unless csv_keys.include?(field)
    end
  end
  missing.uniq
end

#find_record_by_source_identifier(identifier, work_identifier, work_identifier_search) ⇒ Boolean

Attempt to locate an existing repository record by its identifier. The identifier may be a repository object ID or a source_identifier property value. Checks the repository directly (by ID, then by Solr property search) — a Bulkrax Entry record alone is not sufficient, as the object may never have been created.

Parameters:

  • identifier (String)
  • work_identifier (String)

    the source_identifier property name (e.g. “source”)

  • work_identifier_search (String)

    the Solr field for source_identifier (e.g. “source_sim”)

Returns:

  • (Boolean)

    true if a matching repository object is found



182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 182

# Reports whether +identifier+ resolves to an existing object.
#
# The identifier is first tried as an object id through
# Bulkrax.object_factory.find_or_nil. Failing that, a property search is run
# for the collection model class and then each curation concern, stopping at
# the first hit. Any StandardError raised along the way is treated as
# "not found".
#
# @param identifier [String] object id or source identifier value
# @param work_identifier [String] passed to the search as name_field
# @param work_identifier_search [String] passed to the search as search_field
# @return [Boolean] true when a match is found
def find_record_by_source_identifier(identifier, work_identifier, work_identifier_search)
  return false if identifier.blank?

  factory = Bulkrax.object_factory
  return true if factory.find_or_nil(identifier).present?

  searchable_classes = [Bulkrax.collection_model_class, *Bulkrax.curation_concerns]
  searchable_classes.any? do |klass|
    hit = factory.search_by_property(
      value: identifier,
      klass: klass,
      search_field: work_identifier_search,
      name_field: work_identifier
    )
    hit.present?
  end
rescue StandardError
  # Best-effort lookup: a failing backend counts as no match.
  false
end

#find_unrecognized_validation_headers(headers, valid_headers) ⇒ Object



76
77
78
79
80
81
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 76

# Maps each unrecognized header to a suggested replacement.
#
# A header is recognized when it is blank, is listed in +valid_headers+, or
# is listed there once a trailing numeric suffix ("_1", "_2", …) is removed.
# The suggestion is the first correction DidYouMean offers against
# +valid_headers+, or nil when it offers none.
#
# @param headers [Array<String>] CSV headers
# @param valid_headers [Array<String>] accepted header names
# @return [Hash{String => String, nil}] unrecognized header => suggestion
def find_unrecognized_validation_headers(headers, valid_headers)
  spell_checker = DidYouMean::SpellChecker.new(dictionary: valid_headers)
  unknown = headers.reject do |header|
    header.blank? ||
      valid_headers.include?(header) ||
      valid_headers.include?(header.sub(/_\d+\z/, ''))
  end
  unknown.each_with_object({}) do |header, suggestions|
    suggestions[header] = spell_checker.correct(header).first
  end
end

#invert_child_edges(graph, csv_data, suffix_pattern, split_pattern) ⇒ Object



252
253
254
255
256
257
258
259
260
261
262
263
264
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 252

# Adds child-to-parent edges to +graph+ from the child declarations of each
# record: every declared child gets the declaring record's source identifier
# appended to its parent list, unless already listed. Child ids come from
# record[:children] (split through split_or_single) followed by the values
# of raw_row columns matching +suffix_pattern+. Records with a blank
# :source_identifier are skipped. +graph+ is mutated in place.
#
# @param graph [Hash{Object => Array}] child id => parent ids
# @param csv_data [Array<Hash>] records with :source_identifier, :children, :raw_row
# @param suffix_pattern [Regexp] matches numbered children column names
# @param split_pattern [String, Regexp, nil] delimiter for multi-valued cells
# @return [Array<Hash>] +csv_data+
def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern)
  csv_data.each do |row|
    parent_id = row[:source_identifier]
    next if parent_id.blank?

    declared = split_or_single(row[:children], split_pattern)
    numbered = suffixed_values(row[:raw_row], suffix_pattern)
    (declared + numbered).each do |child_id|
      known_parents = (graph[child_id] ||= [])
      known_parents << parent_id unless known_parents.include?(parent_id)
    end
  end
end

#parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key) ⇒ Object

Parse rows from a CsvEntry.read_data result into the canonical record shape. CsvEntry.read_data returns CSV::Row objects with symbol headers; blank rows are already filtered by CsvWrapper.



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 19

# Converts each row into the canonical record hash used by the validators:
# :source_identifier, :model, :parent, :children and :file are read from the
# row by the given keys (:model is always read as row[:model]), and :raw_row
# holds the whole row as a hash with stringified keys.
#
# Any StandardError raised while parsing is logged and an empty array is
# returned.
#
# @param raw_csv [Enumerable] rows supporting #[] and #to_h
# @param source_id_key [Symbol] row key of the source identifier column
# @param parent_key [Symbol] row key of the parents column
# @param children_key [Symbol] row key of the children column
# @param file_key [Symbol] row key of the file column
# @return [Array<Hash>] one record per row, or [] on error
def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
  column_for = {
    source_identifier: source_id_key,
    model: :model,
    parent: parent_key,
    children: children_key,
    file: file_key
  }

  raw_csv.map do |row|
    record = column_for.transform_values { |column| row[column] }
    # Keep the full row, keyed by header name as a String.
    record[:raw_row] = row.to_h.transform_keys(&:to_s)
    record
  end
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}")
  []
end

#resolve_children_split_pattern(mappings) ⇒ Object



215
216
217
218
219
220
221
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 215

# Resolves the delimiter used to split multi-valued children cells.
#
# Reads the 'split' setting of the 'children' mapping, trying string keys
# first and symbol keys second.
#
# @param mappings [Hash] field mappings
# @return [Object, nil] nil when the setting is blank,
#   Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON when it is exactly true,
#   otherwise the configured value unchanged
def resolve_children_split_pattern(mappings)
  configured = mappings.dig('children', 'split') || mappings.dig(:children, :split)

  if configured.blank?
    nil
  elsif configured == true
    Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
  else
    configured
  end
end

#resolve_parent_split_pattern(mappings) ⇒ Object



207
208
209
210
211
212
213
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 207

# Resolves the delimiter used to split multi-valued parents cells.
#
# Reads the 'split' setting of the 'parents' mapping, trying string keys
# first and symbol keys second.
#
# @param mappings [Hash] field mappings
# @return [Object, nil] nil when the setting is blank,
#   Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON when it is exactly true,
#   otherwise the configured value unchanged
def resolve_parent_split_pattern(mappings)
  configured = mappings.dig('parents', 'split') || mappings.dig(:parents, :split)

  if configured.blank?
    nil
  elsif configured == true
    Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
  else
    configured
  end
end

#resolve_relationship_column(mappings, flag, default) ⇒ Object

Returns the raw CSV column name (String) for a relationship field. Looks for the mapping entry flagged with flag and returns its first from value, falling back to default when none is found.



202
203
204
205
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 202

# Returns the CSV column name for a relationship field.
#
# Finds the first Hash-valued mapping entry whose +flag+ value is truthy and
# returns the first of its 'from' values, or +default+ when there is no such
# entry or it has no 'from' value.
#
# 'from' is coerced with Array() so that a bare String is returned whole;
# calling #first on it directly would yield only its first character under
# ActiveSupport, or raise in plain Ruby.
#
# @param mappings [Hash] field mappings
# @param flag [String] mapping key marking the relationship field
# @param default [String] column name used when nothing is configured
# @return [String] the resolved column name
def resolve_relationship_column(mappings, flag, default)
  _name, config = mappings.find { |_k, v| v.is_a?(Hash) && v[flag] }
  Array(config && config['from']).first || default
end

#resolve_validation_key(mapping_manager, key: nil, flag: nil, default:) ⇒ Object

Resolve a symbol key from mappings for use as a record hash key. Returns a Symbol matching the parser’s symbol-keyed record hash.



11
12
13
14
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 11

# Resolves the record hash key for a mapped column.
#
# Asks +mapping_manager+ for the candidate column names (passing +default+
# as a String) and returns the first candidate as a Symbol. When there are
# no candidates, +default+ is returned unchanged.
#
# @param mapping_manager [Object] responds to
#   resolve_column_name(key:, flag:, default:)
# @param key [String, nil] mapping key to resolve
# @param flag [String, nil] mapping flag to resolve
# @param default [Symbol] fallback key
# @return [Symbol] the resolved key, or +default+
def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:)
  candidates = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s)
  first_choice = candidates.first
  resolved = first_choice.to_sym unless first_choice.nil?
  resolved || default
end

#split_or_single(value, split_pattern) ⇒ Object



266
267
268
269
270
271
272
273
274
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 266

# Normalizes a cell value into a list of stripped, non-blank strings.
#
# With a +split_pattern+ the value is split on it; without one, a present
# value becomes a one-element list and anything else an empty list.
#
# @param value [Object, nil] raw cell value
# @param split_pattern [String, Regexp, nil] delimiter, or nil for no splitting
# @return [Array<String>]
def split_or_single(value, split_pattern)
  unless split_pattern
    return [] unless value.present?

    return [value.to_s.strip]
  end

  value.to_s.split(split_pattern).filter_map do |piece|
    trimmed = piece.strip
    trimmed unless trimmed.blank?
  end
end

#suffixed_values(raw_row, suffix_pattern) ⇒ Object



276
277
278
279
# File 'app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb', line 276

# Collects the stripped, non-blank values of every +raw_row+ entry whose key
# matches +suffix_pattern+, in row order.
#
# @param raw_row [Hash] column name => cell value
# @param suffix_pattern [Regexp] pattern the column name (as a String) must match
# @return [Array<String>]
def suffixed_values(raw_row, suffix_pattern)
  raw_row.each_with_object([]) do |(column, cell), collected|
    next unless column.to_s.match?(suffix_pattern)

    text = cell.to_s.strip
    collected << text unless text.blank?
  end
end