Class: Bulkrax::CsvEntry

Inherits:
Entry show all
Defined in:
app/models/bulkrax/csv_entry.rb

Overview

TODO: We need to rework this class some to address the Metrics/ClassLength rubocop offense. We do too much in these entry classes. We need to extract the common logic from the various entry models into a module that can be shared between them.

Direct Known Subclasses

CsvCollectionEntry, CsvFileSetEntry

Defined Under Namespace

Modules: AttributeBuilderMethod Classes: CsvPathError, CsvWrapper, MissingMetadata, RecordNotFound

Instance Attribute Summary

Attributes inherited from Entry

#all_attrs

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Entry

#build, child_field, #exporter?, #fetch_field_mapping, #find_collection, #importer?, #last_run, parent_field, #source_identifier, #work_identifier

Methods included from HasLocalProcessing

#add_local

Methods included from StatusInfo

#current_status, #failed?, #last_error, #set_status_info, #skipped?, #status, #status_at, #succeeded?

Methods included from ExportBehavior

#build_for_exporter, #file_extension, #filename, #hyrax_record

Methods included from ImportBehavior

#active_id_for_authority?, #add_admin_set_id, #add_collections, #add_rights_statement, #add_user_to_permission_templates!, #add_visibility, #build_for_importer, #child_jobs, #factory, #factory_class, #override_rights_statement, #parent_jobs, #rights_statement, #sanitize_controlled_uri_value, #sanitize_controlled_uri_values!, #validate_value

Methods included from HasMatchers

#add_metadata, #excluded?, #field_supported?, #field_to, #fields_that_are_always_multiple, #fields_that_are_always_singular, #get_object_name, #matched_metadata, #multiple?, #multiple_metadata, #schema_form_definitions, #set_parsed_data, #set_parsed_object_data, #single_metadata, #supported_bulkrax_fields

Class Method Details

.data_for_entry(data, _source_id, parser) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
# File 'app/models/bulkrax/csv_entry.rb', line 85

# Converts one CSV row into the raw hash stored for an entry.
#
# @param data [CSV::Table, #to_h] a row-like object; a CSV::Table is reduced to its first row
# @param _source_id [Object] unused
# @param parser [#related_parents_raw_mapping, #related_children_raw_mapping]
# @return [Hash] the row as a hash, with :model copied explicitly and custom
#   parent/child column names aliased to :parents / :children
def self.data_for_entry(data, _source_id, parser)
  # Only the first row of a multi-line table is used.
  row = data.is_a?(CSV::Table) ? data.first : data

  # model has to be copied separately so that it doesn't get mistranslated by to_h
  raw_data = row.to_h
  raw_data[:model] = row[:model] if row[:model].present?

  # When the parents/children mapping points at a custom column, alias that column
  # to the standard key so downstream code can find it whatever the CSV calls it.
  [
    [:parents, :related_parents_raw_mapping],
    [:children, :related_children_raw_mapping]
  ].each do |standard_key, mapping_reader|
    next unless parser.public_send(mapping_reader).present?

    custom_column = parser.public_send(mapping_reader)
    next unless raw_data.key?(custom_column.to_sym)
    next if custom_column == standard_key.to_s

    raw_data[standard_key] = raw_data[custom_column.to_sym]
  end

  raw_data
end

.fields_from_data(data) ⇒ Object



32
33
34
# File 'app/models/bulkrax/csv_entry.rb', line 32

# Lists the distinct, non-nil header names of the given data.
#
# @param data [#headers] object whose headers may be nested arrays
# @return [Array] flattened headers with nils and duplicates removed
def self.fields_from_data(data)
  flattened_headers = data.headers.flatten
  flattened_headers.compact.uniq
end

.matcher_classObject



384
385
386
# File 'app/models/bulkrax/csv_entry.rb', line 384

# Returns the matcher class associated with this entry type.
#
# @return [Class] Bulkrax::CsvMatcher
def self.matcher_class
  Bulkrax::CsvMatcher
end

.read_data(path) ⇒ Object

There’s a risk that this reads the whole file into memory and could cause a memory leak. We strip any special characters out of the headers (looking at you, Excel).

Raises:



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'app/models/bulkrax/csv_entry.rb', line 40

# Parses CSV content from a path or a readable IO and wraps the result.
#
# Headers are converted to symbols after removing every character that is not
# a word character, digit, dot, space or hyphen. The default options can be
# overridden by csv_read_data_options.
#
# @param path [String, #read] a filesystem path, or an IO-like object
# @return [Object] csv_wrapper_class instance built from the parsed table
# @raise [CsvPathError] when path is blank
def self.read_data(path)
  raise CsvPathError, 'CSV path empty' if path.blank?

  sanitize_header = lambda do |header|
    header.to_s.gsub(/[^\w\d\. -]+/, '').strip.to_sym
  end
  defaults = { headers: true, header_converters: sanitize_header, encoding: 'utf-8' }
  options = defaults.merge(csv_read_data_options)

  table =
    if path.respond_to?(:read)
      # IO-like input: start from the beginning when the object supports it.
      path.rewind if path.respond_to?(:rewind)
      CSV.parse(path.read, **options)
    else
      CSV.read(path, **options)
    end

  csv_wrapper_class.new(table)
end

Instance Method Details

#add_fileObject



165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'app/models/bulkrax/csv_entry.rb', line 165

# Fills parsed_metadata['file'] with resolved file paths taken from the
# record's 'file' value.
#
# A String value is split on Bulkrax.multi_value_element_split_on; an Array is
# used as-is; anything else leaves the existing (or empty) list in place. Blank
# names are dropped, spaces in names are replaced with underscores, and each
# name is resolved through path_to_file.
#
# NOTE(review): the rendered listing showed `self.['file']` with the attribute
# name missing; `parsed_metadata` is presumed — confirm against
# app/models/bulkrax/csv_entry.rb.
#
# @return [Array<String>] the resolved file paths
def add_file
  self.parsed_metadata['file'] ||= []
  raw_files = record['file']
  if raw_files.is_a?(String)
    self.parsed_metadata['file'] = raw_files.split(Bulkrax.multi_value_element_split_on)
  elsif raw_files.is_a?(Array)
    self.parsed_metadata['file'] = raw_files
  end
  self.parsed_metadata['file'] = self.parsed_metadata['file'].map do |f|
    next if f.blank?

    path_to_file(f.tr(' ', '_'))
  end.compact
end

#add_identifierObject



132
133
134
# File 'app/models/bulkrax/csv_entry.rb', line 132

# Stores the record's source identifier value, wrapped in a one-element array,
# under the work_identifier key.
#
# NOTE(review): the rendered listing showed `self.[work_identifier]` with the
# attribute name missing; `parsed_metadata` is presumed — confirm against
# app/models/bulkrax/csv_entry.rb.
def add_identifier
  self.parsed_metadata[work_identifier] = [record[source_identifier]]
end

#add_ingested_metadataObject



156
157
158
159
160
161
162
163
# File 'app/models/bulkrax/csv_entry.rb', line 156

# Feeds every key/value pair of the record to add_metadata, in record order.
#
# A non-zero number embedded in the key (e.g. "title_2") becomes a zero-based
# index (1); keys with no number, or with the number 0, pass a nil index.
#
# NOTE(review): the rendered listing had lost the method name and the name of
# the call inside the loop. The name comes from the heading above this listing
# and `add_metadata` from the HasMatchers method list — confirm against
# app/models/bulkrax/csv_entry.rb.
def add_ingested_metadata
  # we do not want to sort the values in the record before adding the metadata.
  # if we do, the factory_class will be set to the default_work_type for all values that come before "model" or "work type"
  record.each do |key, value|
    number = key[/\d+/].to_i
    index = number.zero? ? nil : number - 1
    add_metadata(key_without_numbers(key), value, index)
  end
end

#add_metadata_for_modelObject



142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'app/models/bulkrax/csv_entry.rb', line 142

# Runs the model-specific steps for the entry's factory_class:
# - collection model: adds the collection type gid when ::Hyrax is defined
# - file model: validates the filename, adds the file path, validates the parent
# - anything else: adds files (unless skipped, see below) and the admin set id
#
# NOTE(review): the rendered listing had lost the method name and the predicate
# after `importerexporter.`. The name comes from the heading above this listing;
# `metadata_only?` is presumed — confirm against app/models/bulkrax/csv_entry.rb.
def add_metadata_for_model
  if factory_class.present? && factory_class == Bulkrax.collection_model_class
    add_collection_type_gid if defined?(::Hyrax)
    # add any additional collection metadata methods here
  elsif factory_class == Bulkrax.file_model_class
    validate_presence_of_filename!
    add_path_to_file
    validate_presence_of_parent!
  else
    add_file unless importerexporter.metadata_only?
    add_admin_set_id
  end
end

#build_export_metadataObject



179
180
181
182
183
184
185
186
187
188
189
# File 'app/models/bulkrax/csv_entry.rb', line 179

# Rebuilds the entry's export metadata from scratch, saves the entry and
# returns the resulting hash.
#
# File metadata is only built when a collection model class is configured and
# the exported record is not an instance of it.
#
# NOTE(review): the rendered listing had lost the method name, the attribute
# after `self.` and four statements. The restored calls are the four
# build_*_metadata methods documented on this page, in their source-line order;
# `parsed_metadata` is presumed — confirm against app/models/bulkrax/csv_entry.rb.
#
# @return [Hash] the freshly built metadata
def build_export_metadata
  self.parsed_metadata = {}

  build_system_metadata
  build_files_metadata if Bulkrax.collection_model_class.present? && !hyrax_record.is_a?(Bulkrax.collection_model_class)
  build_relationship_metadata
  build_mapping_metadata
  self.save!

  self.parsed_metadata
end

#build_files_metadataObject



203
204
205
206
207
208
209
210
211
212
213
214
# File 'app/models/bulkrax/csv_entry.rb', line 203

# Adds file-related export columns for the current hyrax_record.
#
# Works only get thumbnail files (via build_thumbnail_files). For other records
# the file sets are mapped to filenames and written under the export key for
# 'file', joined when the 'file' mapping has a present 'join' setting.
#
# NOTE(review): the rendered listing had lost the method name; it is restored
# from the heading above this listing.
def build_files_metadata
  # attaching files to the FileSet row only so we don't have duplicates when importing to a new tenant
  if hyrax_record.work?
    build_thumbnail_files
  else
    file_mapping = key_for_export('file')
    file_sets = hyrax_record.file_set? ? Array.wrap(hyrax_record) : hyrax_record.file_sets
    filenames = map_file_sets(file_sets)

    handle_join_on_export(file_mapping, filenames, mapping['file']&.[]('join')&.present?)
  end
end

#build_mapping_metadataObject



272
273
274
275
276
277
278
279
280
# File 'app/models/bulkrax/csv_entry.rb', line 272

# Walks the field mapping and, for each entry that AttributeBuilderMethod
# resolves to a method name, calls that method with the mapping key and value.
# Entries for which no method name is returned are skipped.
#
# NOTE(review): the rendered listing had lost the method name; it is restored
# from the heading above this listing.
def build_mapping_metadata
  fetch_field_mapping.each do |key, value|
    method_name = AttributeBuilderMethod.for(key: key, value: value, entry: self)
    next unless method_name

    send(method_name, key, value)
  end
end

#build_metadataObject



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'app/models/bulkrax/csv_entry.rb', line 98

# Validates the record, then rebuilds the entry's import metadata step by step
# and returns the resulting hash.
#
# NOTE(review): the rendered listing had lost the method name, the attribute
# after `self.` and two statements. The restored calls are the two sibling
# methods documented on this page (add_ingested_metadata, add_metadata_for_model);
# `parsed_metadata` is presumed — confirm against app/models/bulkrax/csv_entry.rb.
#
# @return [Hash] the freshly built metadata
# @raise [RecordNotFound, MissingMetadata] from validate_record
def build_metadata
  validate_record

  self.parsed_metadata = {}
  add_identifier
  establish_factory_class
  add_ingested_metadata
  # TODO(alishaevn): remove the collections stuff entirely and only reference collections via the new parents code
  add_collections
  add_visibility
  add_metadata_for_model
  add_rights_statement
  sanitize_controlled_uri_values!
  add_local

  self.parsed_metadata
end

#build_metadata_for_deleteObject

limited metadata is needed for delete jobs



117
118
119
120
121
122
# File 'app/models/bulkrax/csv_entry.rb', line 117

# Builds the limited metadata needed for delete jobs.
#
# NOTE(review): this listing is incomplete as rendered. The method name after
# `def` is missing (the heading above names it build_metadata_for_delete), the
# attribute after `self.` is missing on two lines, and one whole statement
# between establish_factory_class and the final line is blank. The blank
# statement cannot be determined from this page — restore all of these from
# app/models/bulkrax/csv_entry.rb (line 117) before relying on this listing.
def 
  self. = {}
  establish_factory_class
  
  self.
end

#build_object(_key, value) ⇒ Object



282
283
284
285
286
287
288
289
290
# File 'app/models/bulkrax/csv_entry.rb', line 282

# Exports a nested-object property of the hyrax_record.
#
# Does nothing when the record does not respond to the method named in
# value['object'], or when that method returns something empty.
#
# NOTE(review): the rendered listing had lost the name of the final call;
# `object_metadata` is the sibling method documented on this page that takes a
# single data argument — confirm against app/models/bulkrax/csv_entry.rb.
#
# @param _key [Object] unused
# @param value [Hash] mapping configuration; 'object' names the record method to read
def build_object(_key, value)
  return unless hyrax_record.respond_to?(value['object'])

  data = hyrax_record.send(value['object'])
  return if data.empty?

  # ActiveTriples::Relation is converted explicitly before wrapping.
  data = data.to_a if data.is_a?(ActiveTriples::Relation)
  object_metadata(Array.wrap(data))
end

#build_relationship_metadataObject



216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'app/models/bulkrax/csv_entry.rb', line 216

# Collects parent and child relationship ids from the hyrax_record and writes
# them to the export under the parsed parents/children mapping keys.
#
# For each relationship key, every listed method the record responds to is
# called; objects are reduced to their id string, blanks are dropped, and the
# flattened unique list is handed to handle_join_on_export.
#
# NOTE(review): the rendered listing had lost the method name; it is restored
# from the heading above this listing.
def build_relationship_metadata
  # Includes all relationship methods for all exportable record types (works, Collections, FileSets)
  # @TODO: this logic assumes that the relationships are all available via a method that can be called
  #        on the object. With Valkyrie, this is only true for Hyrax-based models which include the
  #        ArResource module. We need to consider reworking this logic into an object factory method
  #        that can handle different types of models.
  relationship_methods = {
    related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids parent],
    related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids member_ids]
  }

  relationship_methods.each do |relationship_key, methods|
    next if relationship_key.blank?

    values = []
    methods.each do |m|
      value = hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
      value_id = value.try(:id)&.to_s || value # get the id if it's an object
      values << value_id if value_id.present?
    end
    values = values.flatten.uniq
    next if values.blank?

    # Safe navigation: a missing parents mapping means "do not join" rather than NoMethodError.
    # NOTE(review): the parents mapping's 'join' setting is also applied to the
    # children key, as in the original — confirm this is intended.
    join = mapping[related_parents_parsed_mapping]&.[]('join').present?
    handle_join_on_export(relationship_key, values, join)
  end
end

#build_system_metadataObject

Metadata required by Bulkrax for round-tripping



192
193
194
195
196
197
198
199
200
201
# File 'app/models/bulkrax/csv_entry.rb', line 192

# Writes the round-trip columns to the export: the record id, the first source
# identifier value, and the model name.
#
# NOTE(review): the rendered listing had lost the method name and the attribute
# after `self.`. The name comes from the heading above this listing;
# `parsed_metadata` is presumed — confirm against app/models/bulkrax/csv_entry.rb.
def build_system_metadata
  self.parsed_metadata['id'] = hyrax_record.id
  source_id = hyrax_record.send(work_identifier)
  # Because ActiveTriples::Relation does not respond to #to_ary we can't rely on Array.wrap universally
  source_id = source_id.to_a if source_id.is_a?(ActiveTriples::Relation)
  source_id = Array.wrap(source_id).first
  self.parsed_metadata[source_identifier] = source_id
  model_name = Bulkrax.object_factory.model_name(resource: hyrax_record)
  self.parsed_metadata[key_for_export('model')] = model_name
end

#build_thumbnail_filesObject



359
360
361
362
363
364
365
366
367
# File 'app/models/bulkrax/csv_entry.rb', line 359

# Exports the record's thumbnail under 'thumbnail_file' columns (never joined).
# Does nothing unless the importerexporter includes thumbnails and the object
# factory returns a thumbnail for the record.
def build_thumbnail_files
  return unless importerexporter.include_thumbnails

  thumb = Bulkrax.object_factory.thumbnail_for(resource: hyrax_record)
  return unless thumb

  thumb_filenames = map_file_sets(Array.wrap(thumb))
  handle_join_on_export('thumbnail_file', thumb_filenames, false)
end

#build_value(property_name, mapping_config) ⇒ Object



292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'app/models/bulkrax/csv_entry.rb', line 292

# Exports one property of the hyrax_record.
#
# When the mapping asks for a join, or the value is not Enumerable, a single
# column is written with the joined/stringified value. Otherwise one numbered
# column ("<key>_1", "<key>_2", ...) is written per element.
#
# NOTE(review): the rendered listing showed `self.[...]` with the attribute
# name missing; `parsed_metadata` is presumed — confirm against
# app/models/bulkrax/csv_entry.rb.
#
# @param property_name [String, Symbol] record method to read
# @param mapping_config [Hash] mapping configuration; 'join' selects the single-column form
def build_value(property_name, mapping_config)
  return unless hyrax_record.respond_to?(property_name.to_s)

  data = hyrax_record.send(property_name.to_s)

  if mapping_config['join'] || !data.is_a?(Enumerable)
    self.parsed_metadata[key_for_export(property_name)] = prepare_export_data_with_join(data)
  else
    data.each_with_index do |d, i|
      self.parsed_metadata["#{key_for_export(property_name)}_#{i + 1}"] = prepare_export_data(d)
    end
  end
end

#collection_identifiersObject



388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# File 'app/models/bulkrax/csv_entry.rb', line 388

# Resolves the record's parent references to identifiers of collection entries
# belonging to the same importerexporter. The result is memoized once it is
# non-empty.
#
# NOTE(review): the rendered listing showed `e.&.[](source_identifier)` with
# the reader name missing; `raw_metadata` is presumed — confirm against
# app/models/bulkrax/csv_entry.rb.
#
# @return [Array] identifiers of the matching collection entries (possibly empty)
# @raise [StandardError] when one reference matches more than one collection entry
def collection_identifiers
  return @collection_identifiers if @collection_identifiers.present?

  parent_field_mapping = self.class.parent_field(parser)
  return [] unless parent_field_mapping.present? && record[parent_field_mapping].present?

  identifiers = []
  split_references = record[parent_field_mapping].split(Bulkrax.multi_value_element_split_on)
  split_references.each do |c_reference|
    matching_collection_entries = importerexporter.entries.select do |e|
      (e.raw_metadata&.[](source_identifier) == c_reference) &&
        e.is_a?(CsvCollectionEntry)
    end
    raise ::StandardError, 'Only expected to find one matching entry' if matching_collection_entries.count > 1
    identifiers << matching_collection_entries.first&.identifier
  end
  @collection_identifiers = identifiers.compact.presence || []
end

#collections_created?Boolean

Returns:

  • (Boolean)


407
408
409
410
# File 'app/models/bulkrax/csv_entry.rb', line 407

# Unconditionally reports true in this class.
#
# @return [Boolean] always true
def collections_created?
  # TODO: look into if this method is still needed after new relationships code
  true
end

#establish_factory_classObject



136
137
138
139
140
# File 'app/models/bulkrax/csv_entry.rb', line 136

# Adds the record's model value as 'model' metadata for every model field
# mapping key that the record contains.
#
# NOTE(review): the rendered listing had lost the name of the call inside the
# loop; `add_metadata` is restored from the HasMatchers method list — confirm
# against app/models/bulkrax/csv_entry.rb.
def establish_factory_class
  parser.model_field_mappings.each do |key|
    add_metadata('model', record[key]) if record.key?(key)
  end
end

#find_collection_idsObject



412
413
414
415
416
417
418
419
420
421
422
423
# File 'app/models/bulkrax/csv_entry.rb', line 412

# Returns collection_ids, first appending the id of every collection found for
# collection_identifiers that is not already listed. The lookup is skipped
# entirely when collections_created? is true.
def find_collection_ids
  return self.collection_ids if collections_created?

  if collection_identifiers.present?
    collection_identifiers.each do |identifier|
      collection = find_collection(identifier)
      next if collection.blank?
      next if self.collection_ids.include?(collection.id)

      self.collection_ids << collection.id
    end
  end

  self.collection_ids
end

#handle_join_on_export(key, values, join) ⇒ Object



369
370
371
372
373
374
375
376
377
378
# File 'app/models/bulkrax/csv_entry.rb', line 369

# Writes export values for a key either as one joined column or as numbered
# columns.
#
# With join, the values are joined with Bulkrax.multi_value_element_join_on
# under key. Without it, each value goes to "<key>_<n>" (1-based) and any
# existing un-numbered key entry is removed.
#
# NOTE(review): the rendered listing showed bare `[key] = ...` and
# `.delete(key)` with the receiver missing; `parsed_metadata` is presumed —
# confirm against app/models/bulkrax/csv_entry.rb.
#
# @param key [String] export column name
# @param values [Array] values to export
# @param join [Boolean] whether to write one joined column
def handle_join_on_export(key, values, join)
  if join
    parsed_metadata[key] = values.join(Bulkrax.multi_value_element_join_on)
  else
    values.each_with_index do |value, i|
      parsed_metadata["#{key}_#{i + 1}"] = value
    end
    parsed_metadata.delete(key)
  end
end

#key_for_export(key) ⇒ Object

On export the key becomes the from and the from becomes the destination. It is the opposite of the import because we are moving data in the opposite direction. Metadata that does not have a specific Bulkrax entry is mapped to the key name, as matching keys coming in are mapped by the CSV parser automatically.



308
309
310
311
312
313
# File 'app/models/bulkrax/csv_entry.rb', line 308

# Translates an internal key to its export column name.
#
# The key is looked up in the mapping without its numbering; when a mapping
# exists, its first 'from' value is used, otherwise the un-numbered key itself.
# Whatever was removed from the key to un-number it is appended again.
#
# @param key [String] internal key, possibly numbered
# @return [String] export column name
def key_for_export(key)
  base_key = key_without_numbers(key)
  field_config = mapping[base_key]
  export_name = field_config ? field_config['from'].first : base_key
  # Bring the number back if there is one
  number_suffix = key.sub(base_key, '')
  "#{export_name}#{number_suffix}"
end

#object_metadata(data) ⇒ Object



331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# File 'app/models/bulkrax/csv_entry.rb', line 331

# Exports nested-object data as numbered columns.
#
# Each element of data is a stringified Ruby structure that is evaluated back
# into hashes. Every hash key becomes "<export key>_<object number>", or
# "<export key>_<object number>_<item number>" when the value is an Array.
#
# SECURITY: this method calls eval on its input; it must never receive data
# that is not fully trusted.
#
# NOTE(review): the rendered listing had lost the method name and the attribute
# after `self.`. The name comes from the heading above this listing;
# `parsed_metadata` is presumed — confirm against app/models/bulkrax/csv_entry.rb.
#
# @param data [Array<String>] stringified Ruby arrays/hashes
def object_metadata(data)
  # NOTE: What is `d` in this case:
  #
  #  "[{\"single_object_first_name\"=>\"Fake\", \"single_object_last_name\"=>\"Fakerson\", \"single_object_position\"=>\"Leader, Jester, Queen\", \"single_object_language\"=>\"english\"}]"
  #
  # The above is a stringified version of a Ruby string.  Using eval is a very bad idea as it
  # will execute the value of `d` within the full Ruby interpreter context.
  #
  # TODO: Would it be possible to store this as a non-string?  Maybe the actual Ruby Array and Hash?
  data = data.map { |d| eval(d) }.flatten # rubocop:disable Security/Eval

  data.each_with_index do |obj, index|
    next if obj.nil?
    # allow the object_key to be valid whether it's a string or symbol
    obj = obj.with_indifferent_access

    obj.each_key do |key|
      if obj[key].is_a?(Array)
        obj[key].each_with_index do |nested_item, nested_index|
          self.parsed_metadata["#{key_for_export(key)}_#{index + 1}_#{nested_index + 1}"] = prepare_export_data(nested_item)
        end
      else
        self.parsed_metadata["#{key_for_export(key)}_#{index + 1}"] = prepare_export_data(obj[key])
      end
    end
  end
end

#path_to_file(file) ⇒ Object

If only filename is given, construct the path (/files/my_file). If file contains a path separator (e.g. attachments/cat_scan.jpg), resolve relative to the CSV’s directory.



427
428
429
430
431
432
433
434
435
436
437
438
439
440
# File 'app/models/bulkrax/csv_entry.rb', line 427

# Resolves a file reference from the CSV to a path.
#
# - an existing path is returned unchanged
# - a reference containing '/' is resolved by resolve_relative_file_path
# - a bare filename is looked up in the parser's path_to_files directory
#
# @param file [String] file reference from the CSV
# @return [String] the resolved path
# @raise [RuntimeError] when the files directory is unknown or the file is not found in it
def path_to_file(file)
  if File.exist?(file)
    file
  elsif file.include?('/')
    # Relative path: resolve from CSV's directory (allows arbitrary subdirectory names, not just "files")
    resolve_relative_file_path(file)
  else
    # Bare filename: use legacy files/ directory for backward compatibility and round-tripping
    files_dir = importerexporter.parser.path_to_files
    raise "Could not determine path to files directory. Ensure the import package contains a zip or a valid import_file_path." if files_dir.nil?

    candidate = File.join(files_dir, file)
    unless File.exist?(candidate)
      raise "File not found: #{candidate}. Check the file column in your CSV and ensure the file exists in the import package or path_to_files directory."
    end

    candidate
  end
end

#prepare_export_data(datum) ⇒ Object



323
324
325
326
327
328
329
# File 'app/models/bulkrax/csv_entry.rb', line 323

# Converts a single value for export: an ActiveTriples::Resource becomes its
# URI string, anything else is returned unchanged.
#
# @param datum [Object] value to export
# @return [Object] URI string for resources, otherwise datum itself
def prepare_export_data(datum)
  return datum unless datum.is_a?(ActiveTriples::Resource)

  datum.to_uri.to_s
end

#prepare_export_data_with_join(data) ⇒ Object



315
316
317
318
319
320
321
# File 'app/models/bulkrax/csv_entry.rb', line 315

# Converts a value to a single export string.
#
# Non-enumerable values are stringified; an empty enumerable gives ""; other
# enumerables have each element prepared and joined with
# Bulkrax.multi_value_element_join_on.
#
# @param data [Object] single value or enumerable of values
# @return [String]
def prepare_export_data_with_join(data)
  if data.is_a?(Enumerable)
    return "" if data.empty?

    prepared = data.map { |d| prepare_export_data(d) }
    prepared.join(Bulkrax.multi_value_element_join_on).to_s
  else
    # Yes...it's possible we're asking to coerce a multi-value but only have a single value.
    data.to_s
  end
end

#recordObject



380
381
382
# File 'app/models/bulkrax/csv_entry.rb', line 380

# Memoized accessor for the entry's record.
#
# NOTE(review): the rendered listing showed `@record ||= ` with the right-hand
# side missing; `raw_metadata` is presumed — confirm against
# app/models/bulkrax/csv_entry.rb.
def record
  @record ||= raw_metadata
end

#validate_recordObject

Raises:



124
125
126
127
128
129
130
# File 'app/models/bulkrax/csv_entry.rb', line 124

# Ensures the record exists and carries every element the parser requires.
#
# @return [nil] when the record is valid
# @raise [RecordNotFound] when record is nil
# @raise [MissingMetadata] when the parser reports missing required elements
def validate_record
  raise RecordNotFound, 'Record not found' if record.nil?
  return if importerexporter.parser.required_elements?(record)

  missing = importerexporter.parser.missing_elements(record).join(', ')
  raise MissingMetadata, "Missing required elements, missing element(s) are: #{missing}"
end