Class: Bulkrax::CsvParser

Inherits:

ApplicationParser

Object
ApplicationParser
Bulkrax::CsvParser

show all

Includes:: CsvTemplateGeneration, CsvValidation, ErroredEntries, ExportBehavior

Defined in:: app/parsers/bulkrax/csv_parser.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb

Overview

rubocop:disable Metrics/ClassLength

Direct Known Subclasses

BagitParser

Defined Under Namespace

Modules: CsvTemplateGeneration, CsvValidation, CsvValidationHelpers, CsvValidationHierarchy

Instance Attribute Summary collapse

#collections ⇒ Object

rubocop:enabled Metrics/AbcSize.
#file_sets ⇒ Object
#validation_mode ⇒ Object

Returns the value of attribute validation_mode.
#works ⇒ Object

Attributes inherited from ApplicationParser

#headers, #importerexporter

Class Method Summary collapse

.export_supported? ⇒ Boolean

Instance Method Summary collapse

#build_records ⇒ Object

rubocop:disable Metrics/AbcSize.
#collection_entry_class ⇒ Object
#collections_total ⇒ Object
#create_new_entries ⇒ Object (also: #create_from_collection, #create_from_importer, #create_from_worktype, #create_from_all)
#current_records_for_export ⇒ Object
#entry_class ⇒ Object
#export_headers ⇒ Object

All possible column names.
#export_key_allowed(key) ⇒ Object
#file_paths ⇒ Object

Retrieve file paths for [:file] mapping in records and check all listed files exist.
#file_set_entry_class ⇒ Object
#file_sets_total ⇒ Object
#import_fields ⇒ Object

We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data.
#missing_elements(record) ⇒ Object
#object_names ⇒ Object
#path_to_files(**args) ⇒ Object

Retrieve the path where we expect to find the files.
#records(_opts = {}) ⇒ Object
#records_split_count ⇒ Object
#required_elements?(record) ⇒ Boolean
#retrieve_cloud_files(files, importer) ⇒ Object
#setup_export_file(folder_count) ⇒ Object

Builds the path of the export CSV file; this lives in the parser because the file layout is specific to the format.
#sort_entries(entries) ⇒ Object
#sort_headers(headers) ⇒ Object
#store_files(identifier, folder_count) ⇒ Object
#total ⇒ Object

TODO: figure out why using the version of this method that’s in the bagit parser breaks specs for the “if importer?” line.
#unzip(file_to_unzip) ⇒ Object
#valid_entry_types ⇒ Object
#valid_import? ⇒ Boolean
#works_total ⇒ Object
#write_files ⇒ Object

export methods.
#write_partial_import_file(file) ⇒ Object

Constructor Details

This class inherits a constructor from Bulkrax::ApplicationParser

Instance Attribute Details

#collections ⇒ `Object`

rubocop:enabled Metrics/AbcSize

# File 'app/parsers/bulkrax/csv_parser.rb', line 64

# Records classified as collections. The classification is computed on first
# access by calling #build_records.
#
# @return [Array] the value #build_records stored in @collections
def collections
  return @collections unless @collections.nil?

  build_records
  @collections
end

#file_sets ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 74

# Records classified as file sets. The classification is computed on first
# access by calling #build_records.
#
# @return [Array] the value #build_records stored in @file_sets
def file_sets
  return @file_sets unless @file_sets.nil?

  build_records
  @file_sets
end

#validation_mode ⇒ `Object`

Returns the value of attribute validation_mode.



10
11
12

# File 'app/parsers/bulkrax/csv_parser.rb', line 10

# Reader for @validation_mode. When this is truthy, #records does not write
# the record total back to the importer.
#
# @return [Object] the current value of @validation_mode (nil when unset)
def validation_mode
  @validation_mode
end

#works ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 69

# Records classified as works. The classification is computed on first access
# by calling #build_records.
#
# @return [Array] the value #build_records stored in @works
def works
  return @works unless @works.nil?

  build_records
  @works
end

Class Method Details

.export_supported? ⇒ `Boolean`

Returns:

(Boolean)



12
13
14

# File 'app/parsers/bulkrax/csv_parser.rb', line 12

# Reports whether this parser supports export.
#
# @return [Boolean] always true for this parser
def self.export_supported?
  true
end

Instance Method Details

#build_records ⇒ `Object`

rubocop:disable Metrics/AbcSize

# File 'app/parsers/bulkrax/csv_parser.rb', line 31

# Sorts the parsed records into @collections, @file_sets and @works based on the
# value found under each model field mapping. When the first record carries none
# of the model fields, every record is treated as a work.
#
# @return [true] always; the results are stored in instance variables
def build_records
  @collections = []
  @works = []
  @file_sets = []

  model_keys = model_field_mappings.map(&:to_sym)

  unless model_keys.any? { |key| key.in?(records.first.keys) }
    # if no model is specified, assume all records are works
    @works = records.flatten.compact.uniq
    return true
  end

  records.each do |record|
    model_keys.each do |key|
      next unless record.key?(key)

      model = record[key]&.strip || ""
      # TODO: Eventually this should be refactored to use Hyrax.config.collection_model
      #       We aren't right now because so many Bulkrax users are in between Fedora and Valkyrie
      bucket =
        if %w[collection collectionresource].any? { |name| model.casecmp(name).zero? }
          @collections
        elsif %w[fileset hyrax::fileset].any? { |name| model.casecmp(name).zero? }
          @file_sets
        else
          @works
        end
      bucket << record
    end
  end

  @collections = @collections.flatten.compact.uniq
  @file_sets = @file_sets.flatten.compact.uniq
  @works = @works.flatten.compact.uniq

  true
end

#collection_entry_class ⇒ `Object`



165
166
167

# File 'app/parsers/bulkrax/csv_parser.rb', line 165

# Entry class this parser uses for collections.
#
# @return [Class] CsvCollectionEntry
def collection_entry_class
  CsvCollectionEntry
end

#collections_total ⇒ `Object`



79
80
81

# File 'app/parsers/bulkrax/csv_parser.rb', line 79

# Number of records returned by #collections.
#
# @return [Integer]
def collections_total
  collections.size
end

#create_new_entries ⇒ `Object` Also known as: create_from_collection, create_from_importer, create_from_worktype, create_from_all

# File 'app/parsers/bulkrax/csv_parser.rb', line 143

# Creates (or finds) an exporter entry for every record selected for export,
# runs the export job for it inline, and merges the exported metadata keys into
# the parser's headers. A failing export is logged and skipped.
#
# @return [Object] whatever #current_records_for_export's #each returns
def create_new_entries
  # NOTE: The each method enforces the limit, as it can best optimize the underlying queries.
  current_records_for_export.each do |id, entry_class|
    export_entry = find_or_create_entry(entry_class, id, 'Bulkrax::Exporter')
    exported =
      begin
        ExportWorkJob.perform_now(export_entry.id, current_run.id)
      rescue => e
        Rails.logger.info("#{e.message} was detected during export")
        nil
      end
    next unless exported

    self.headers |= exported.parsed_metadata.keys
  end
end

#current_records_for_export ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 136

# Memoised record set for the current export, built by
# Bulkrax::ParserExportRecordSet.for from this parser and the exporter's
# export_from setting.
#
# @return [Object] the value returned by Bulkrax::ParserExportRecordSet.for
def current_records_for_export
  return @current_records_for_export if @current_records_for_export

  source = importerexporter.export_from
  @current_records_for_export = Bulkrax::ParserExportRecordSet.for(parser: self, export_from: source)
end

#entry_class ⇒ `Object`



161
162
163

# File 'app/parsers/bulkrax/csv_parser.rb', line 161

# Default entry class for this parser; #records uses it to read and shape the
# CSV rows.
#
# @return [Class] CsvEntry
def entry_class
  CsvEntry
end

#export_headers ⇒ `Object`

All possible column names

# File 'app/parsers/bulkrax/csv_parser.rb', line 289

# All column names for the export CSV: the sorted parser headers without
# access_control_id, preceded by id, the source identifier and model.
#
# @return [Array<String>] de-duplicated header names
def export_headers
  columns = sort_headers(headers)

  # access_control_id is never exported
  columns.delete('access_control_id')

  # these lead the header row to maintain the preexisting export behavior
  columns.unshift('id', source_identifier.to_s, 'model')

  columns.uniq
end

#export_key_allowed(key) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 283

# Whether a metadata key may be exported: a new exporter entry must support the
# field, and the key must not be the source identifier.
#
# @param key [String] candidate column name
# @return [Boolean, nil] the falsy result of field_supported? when unsupported,
#   otherwise whether the key differs from the source identifier
def export_key_allowed(key)
  supported = new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key)
  return supported unless supported

  key != source_identifier.to_s
end

#file_paths ⇒ `Object`

Retrieve file paths for [:file] mapping in records

and check all listed files exist.

Raises:

(StandardError)

# File 'app/parsers/bulkrax/csv_parser.rb', line 351

# Resolves every file listed in the records' file column to a path under
# #path_to_files and verifies that each one exists. The result is memoised.
#
# @return [Array<String>] unique existing file paths; empty for metadata-only imports
# @raise [StandardError] when there are no records, when no files directory can
#   be resolved, or (as RuntimeError) when a listed file does not exist
def file_paths
  raise StandardError, 'No records were found' if records.blank?
  return [] if importerexporter.metadata_only?
  return @file_paths if @file_paths

  per_record = records.map do |record|
    column = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
    listed = record[column]
    next if listed.blank?

    # A configured split may be a Regexp or a String pattern; otherwise use the global default.
    configured_split = Bulkrax.field_mappings.dig(self.class.to_s, :file, :split)
    separator =
      if configured_split.is_a?(Regexp)
        configured_split
      elsif configured_split.is_a?(String)
        Regexp.new(configured_split)
      else
        Bulkrax.multi_value_element_split_on
      end
    files_dir = path_to_files
    raise StandardError, "Record references local files but no files directory could be resolved from the import path" if files_dir.nil?

    listed.split(separator).map do |name|
      # spaces in listed names are looked up as underscores on disk
      candidate = File.join(files_dir, name.strip.tr(' ', '_'))
      raise "File #{candidate} does not exist" unless File.exist?(candidate)

      candidate
    end
  end

  @file_paths = per_record.flatten.compact.uniq
end

#file_set_entry_class ⇒ `Object`



169
170
171

# File 'app/parsers/bulkrax/csv_parser.rb', line 169

# Entry class this parser uses for file sets.
#
# @return [Class] CsvFileSetEntry
def file_set_entry_class
  CsvFileSetEntry
end

#file_sets_total ⇒ `Object`



87
88
89

# File 'app/parsers/bulkrax/csv_parser.rb', line 87

# Number of records returned by #file_sets.
#
# @return [Integer]
def file_sets_total
  file_sets.size
end

#import_fields ⇒ `Object`

We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data



92
93
94

# File 'app/parsers/bulkrax/csv_parser.rb', line 92

# Every distinct, non-nil key that appears in any parsed record, in order of
# first appearance. Memoised.
#
# We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data.
#
# Keys are collected directly rather than via `records.inject(:merge).keys`,
# which raised NoMethodError on nil when there were no records at all.
#
# @return [Array] unique record keys; empty when there are no records
def import_fields
  @import_fields ||= records.flat_map(&:keys).compact.uniq
end

#missing_elements(record) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 100

# Lists the required elements that the record does not supply. A mapped field
# counts as supplied when one of its `from` column names matches a non-blank
# record key (after #keys_without_numbers is applied to the keys).
#
# @param record [Hash] column name => value
# @return [Array<String>] required element names with no matching column
def missing_elements(record)
  populated_columns = record.reject { |_, value| value.blank? }.keys.compact.uniq.map(&:to_s)
  keys_from_record = keys_without_numbers(populated_columns)

  supplied = importerexporter.mapping.stringify_keys.each_with_object([]) do |(field, config), found|
    # only hash-shaped mappings can carry a `from` list (string or symbol key)
    sources = Array.wrap(config.is_a?(Hash) ? (config['from'] || config[:from]) : nil)
    sources.each do |source|
      found << field if source.present? && keys_from_record.include?(source.to_s.strip)
    end
  end

  required_elements.map(&:to_s) - supplied.uniq.map(&:to_s)
end

#object_names ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 303

# Distinct, non-nil `object` names declared in the field mapping. Memoised.
#
# The previous `uniq!&.delete(nil)` only removed nil when `uniq!` actually
# dropped a duplicate (it returns nil otherwise), so nil could stay in the list.
#
# @return [Array] unique object names with nil removed
def object_names
  @object_names ||= mapping.values.map { |value| value['object'] }.compact.uniq
end

#path_to_files(**args) ⇒ `Object`

Retrieve the path where we expect to find the files

# File 'app/parsers/bulkrax/csv_parser.rb', line 383

# Retrieve the path where we expect to find the files.
#
# @param args [Hash] optional keywords; only :filename (default '') is read
# @return [String, nil] the resolved path when it exists on disk; otherwise the
#   unzip-based path when both file? and zip? hold; otherwise nil
def path_to_files(**args)
  filename = args.fetch(:filename, '')

  # Reuse the remembered path only when no specific filename was requested.
  return @path_to_files if @path_to_files.present? && filename.blank?
  # The zip file could be either the main import file, or a separate attachments zip file.
  # We want to check for both of those before we determine the path to the files.
  have_zip_file = zip? || (parser_fields['attachments_zip_path'] && zip_file?(parser_fields['attachments_zip_path']))
  # NOTE: @path_to_files is assigned before the existence check below, so it is
  # remembered even when the path does not exist.
  @path_to_files = File.join(
      have_zip_file ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
    )

  return @path_to_files if File.exist?(@path_to_files)

  # TODO: This method silently returns nil if there is no file & no zip file
  File.join(importer_unzip_path, 'files', filename) if file? && zip?
end

#records(_opts = {}) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 16

# Reads the import CSV (or the partial import file when only_updates is set)
# and converts each row into entry data. The result is memoised while present.
# Outside validation mode the row count is also saved on the importer as
# parser_fields['total'].
#
# @param _opts [Hash] unused
# @return [Array] one entry-data object per CSV row
def records(_opts = {})
  return @records if @records.present?

  source_path =
    if only_updates
      parser_fields['partial_import_file_path']
    else
      import_file_path
    end
  rows = entry_class.read_data(source_path)

  # validation mode must not persist anything on the importer
  unless validation_mode
    importer.parser_fields['total'] = rows.count
    importer.save
  end

  @records = rows.map { |row| entry_class.data_for_entry(row, nil, self) }
end

#records_split_count ⇒ `Object`



194
195
196

# File 'app/parsers/bulkrax/csv_parser.rb', line 194

# Group size that #write_files passes to in_groups_of when splitting exported
# entries across numbered CSV files.
#
# @return [Integer] 1000
def records_split_count
  1000
end

#required_elements?(record) ⇒ `Boolean`

Returns:

(Boolean)



96
97
98

# File 'app/parsers/bulkrax/csv_parser.rb', line 96

# Whether the record supplies every required element, i.e. #missing_elements
# returns nothing for it.
#
# @param record [Hash] column name => value
# @return [Boolean]
def required_elements?(record)
  missing_elements(record).blank?
end

#retrieve_cloud_files(files, importer) ⇒ `Object`

TODO:

investigate getting directory structure

TODO:

investigate using perform_later, and having the importer check for

DownloadCloudFileJob before it starts

# File 'app/parsers/bulkrax/csv_parser.rb', line 201

# Downloads the selected cloud files into <path_for_import>/files and records
# the target paths on the importer as parser_fields['original_file_paths'].
#
# Each download runs inline (perform_now) because the files must be in place
# before the importer runs. Enqueuing with perform_later gave no such guarantee.
#
# @param files [Hash] key => file description; each description is read for
#   'file_name' and optional 'auth_header' / 'headers'
# @param importer [#[]] importer whose [:parser_fields] hash is updated
# @return [nil]
def retrieve_cloud_files(files, importer)
  files_path = File.join(path_for_import, 'files')
  FileUtils.mkdir_p(files_path) unless File.exist?(files_path)
  target_files = []
  files.each_pair do |_key, file|
    # fixes bug where auth headers do not get attached properly
    if file['auth_header'].present?
      file['headers'] ||= {}
      file['headers'].merge!(file['auth_header'])
    end
    # this only works for uniquely named files
    target_file = File.join(files_path, file['file_name'].tr(' ', '_'))
    target_files << target_file
    # Now because we want the files in place before the importer runs
    # Problematic for a large upload
    Bulkrax::DownloadCloudFileJob.perform_now(file, target_file)
  end
  importer[:parser_fields]['original_file_paths'] = target_files
  nil
end

#setup_export_file(folder_count) ⇒ `Object`

Builds the path of the export CSV file; this lives in the parser because the file layout is specific to the format.

# File 'app/parsers/bulkrax/csv_parser.rb', line 342

# Ensures the numbered export folder exists under the exporter's export path and
# returns the path of the CSV file to write inside it.
#
# @param folder_count [Integer, String] number of the export folder
# @return [String] <export path>/<folder_count>/export_<source>_from_<from>_<folder_count>.csv
def setup_export_file(folder_count)
  folder = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(folder) unless File.exist?(folder)

  csv_name = "export_#{importerexporter.export_source}_from_#{importerexporter.export_from}_#{folder_count}.csv"
  File.join(folder, csv_name)
end

#sort_entries(entries) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 312

# Orders entries for export by their type: everything else first, then
# collection entries, then file set entries.
#
# TODO: This is a problem in that only these classes are compared.  Instead
#       We should add a comparison operator to the classes.
#
# @param entries [Enumerable] objects responding to #type
# @return [Array] the entries sorted by type rank
def sort_entries(entries)
  # always export models in the same order: work, collection, file set
  type_rank = {
    'Bulkrax::CsvCollectionEntry' => '1',
    'Bulkrax::CsvFileSetEntry' => '2'
  }

  entries.sort_by { |entry| type_rank.fetch(entry.type, '0') }
end

#sort_headers(headers) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 329

def sort_headers(headers)
  # converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
  # while keeping objects grouped together
  headers.sort_by do |item|
    number = item.match(/\d+/)&.[](0) || 0.to_s
    sort_number = number.rjust(4, "0")
    object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
    remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
    "#{object_prefix}_#{sort_number}_#{remainder}"
  end
end

#store_files(identifier, folder_count) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 248

# Copies the original file of each file set belonging to the identified record
# into <exporter_export_path>/<folder_count>/files.
#
# Remote originals (those responding to #uri) are fetched with URI.open in block
# form. The previous bare Kernel#open no longer opens URI objects on Ruby 3, and
# it never closed the handle it opened.
#
# @param identifier [Object] identifier passed to Bulkrax.object_factory.find
# @param folder_count [String] name of the numbered export folder
# @return [Object, nil] the iterated file sets; nil when the record is not found
#   or Ldp::Gone is raised
# @raise [StandardError] wrapping any other error raised while storing the files
def store_files(identifier, folder_count)
  # URI.open is provided by open-uri; requiring it here keeps this method usable on its own.
  require 'open-uri'

  record = Bulkrax.object_factory.find(identifier)
  return unless record

  file_sets = Array.wrap(record) if record.file_set?
  if file_sets.nil? # for valkyrie
    file_sets = record.respond_to?(:file_sets) ? record.file_sets : record.members&.select(&:file_set?)
  end

  if importerexporter.include_thumbnails?
    thumbnail = Bulkrax.object_factory.thumbnail_for(resource: record)
    file_sets << thumbnail if thumbnail.present?
  end

  file_sets.each do |fs|
    path = File.join(exporter_export_path, folder_count, 'files')
    FileUtils.mkdir_p(path) unless File.exist? path

    original_file = Bulkrax.object_factory.original_file(fileset: fs)
    next if original_file.blank?
    file = filename(fs)

    content =
      if original_file.respond_to?(:uri)
        # block form closes the remote handle once the body has been read
        URI.open(original_file.uri, &:read)
      else
        original_file.file.io.read
      end

    File.binwrite(File.join(path, file), content)
  end
rescue Ldp::Gone
  return
rescue StandardError => e
  raise StandardError, "Unable to retrieve files for identifier #{identifier} - #{e.message}"
end

#total ⇒ `Object`

TODO: figure out why using the version of this method that’s in the bagit parser breaks specs for the “if importer?” line

# File 'app/parsers/bulkrax/csv_parser.rb', line 179

# Total number of records for this run, stored in @total: the importer's
# parser_fields['total'] for imports, the export limit (or the export record
# count when the limit is zero) for exports, and 0 otherwise or on any error.
#
# TODO: figure out why using the version of this method that's in the bagit parser
#       breaks specs for the "if importer?" line
#
# @return [Integer, Object] the computed total
def total
  if importer?
    @total = importer.parser_fields['total'] || 0
  elsif exporter?
    export_limit = limit.to_i
    @total = export_limit.zero? ? current_records_for_export.count : export_limit
  else
    @total = 0
  end
rescue StandardError
  @total = 0
end

#unzip(file_to_unzip) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 400

# Unzips the file using the inherited implementation, then passes the importer's
# unzip path to normalize_unzipped_files_structure.
#
# @param file_to_unzip [Object] forwarded unchanged to the parent implementation
# @return [Object] the result of normalize_unzipped_files_structure
def unzip(file_to_unzip)
  super
  normalize_unzipped_files_structure(importer_unzip_path)
end

#valid_entry_types ⇒ `Object`



173
174
175

# File 'app/parsers/bulkrax/csv_parser.rb', line 173

# Names of the entry classes this parser handles, in the order collection,
# file set, default entry. #write_files only exports entries of these types.
#
# @return [Array<String>]
def valid_entry_types
  [collection_entry_class, file_set_entry_class, entry_class].map(&:to_s)
end

#valid_import? ⇒ `Boolean`

Returns:

(Boolean)

# File 'app/parsers/bulkrax/csv_parser.rb', line 113

# Validates the import: the records taken together must supply every required
# element, and #file_paths must resolve to an Array. Any StandardError is passed
# to set_status_info and reported as an invalid import.
#
# @return [Boolean]
def valid_import?
  # Merge all records into one hash. Pairs with a falsy value go first so that
  # any truthy value for the same key overwrites them.
  pairs = records.flat_map(&:to_a)
  empty_pairs, filled_pairs = pairs.partition { |_, value| !value }
  compressed_record = (empty_pairs + filled_pairs).to_h

  error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(compressed_record).join(', ')}"
  raise StandardError, error_alert unless required_elements?(compressed_record)

  file_paths.is_a?(Array)
rescue StandardError => e
  set_status_info(e)
  false
end

#works_total ⇒ `Object`



83
84
85

# File 'app/parsers/bulkrax/csv_parser.rb', line 83

# Number of records returned by #works.
#
# @return [Integer]
def works_total
  works.size
end

#write_files ⇒ `Object`

export methods

# File 'app/parsers/bulkrax/csv_parser.rb', line 224

# Writes the export: the exporter's entries (unique by identifier, restricted to
# #valid_entry_types and sorted by #sort_entries) are written to numbered CSV
# files of at most #records_split_count rows each. Unless the export is
# metadata-only, the files of every non-collection entry are stored alongside.
#
# At most `limit` entries are exported (or `total` when the limit is zero). The
# slice uses an exclusive range; the previous inclusive `0..group_size` exported
# one entry too many.
#
# @return [Object] the result of in_groups_of
def write_files
  require 'open-uri'
  folder_count = 0
  # TODO: This is not performant as well; unclear how to address, but lower priority as of
  #       <2023-02-21 Tue>.
  sorted_entries = sort_entries(importerexporter.entries.uniq(&:identifier))
                   .select { |e| valid_entry_types.include?(e.type) }

  group_size = limit.to_i.zero? ? total : limit.to_i
  sorted_entries[0...group_size].in_groups_of(records_split_count, false) do |group|
    folder_count += 1

    CSV.open(setup_export_file(folder_count), "w", headers: export_headers, write_headers: true) do |csv|
      group.each do |entry|
        csv << entry.parsed_metadata
        # TODO: This is precarious when we have descendents of Bulkrax::CsvCollectionEntry
        next if importerexporter.metadata_only? || entry.type == 'Bulkrax::CsvCollectionEntry'

        store_files(entry.identifier, folder_count.to_s)
      end
    end
  end
end

#write_partial_import_file(file) ⇒ `Object`

# File 'app/parsers/bulkrax/csv_parser.rb', line 124

# Moves an uploaded file into the import directory under the name
# <import file basename>_corrected_entries.csv.
#
# @param file [#path] object whose #path points at the file to move
# @return [String] the destination path
def write_partial_import_file(file)
  original_name = import_file_path.split('/').last
  corrected_name = "#{File.basename(original_name, '.csv')}_corrected_entries.csv"

  File.join(path_for_import, corrected_name).tap do |destination|
    FileUtils.mv(file.path, destination)
  end
end

Class: Bulkrax::CsvParser

Overview

Direct Known Subclasses

Defined Under Namespace

Instance Attribute Summary collapse

Attributes inherited from ApplicationParser

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ExportBehavior

Methods included from ErroredEntries

Methods inherited from ApplicationParser

Constructor Details

Instance Attribute Details

#collections ⇒ Object

#file_sets ⇒ Object

#validation_mode ⇒ Object

#works ⇒ Object

Class Method Details

.export_supported? ⇒ Boolean

Instance Method Details

#build_records ⇒ Object

#collection_entry_class ⇒ Object

#collections_total ⇒ Object

#create_new_entries ⇒ Object Also known as: create_from_collection, create_from_importer, create_from_worktype, create_from_all

#current_records_for_export ⇒ Object

#entry_class ⇒ Object

#export_headers ⇒ Object

#export_key_allowed(key) ⇒ Object

#file_paths ⇒ Object

#file_set_entry_class ⇒ Object

#file_sets_total ⇒ Object

#import_fields ⇒ Object

#missing_elements(record) ⇒ Object

#object_names ⇒ Object

#path_to_files(**args) ⇒ Object

#records(_opts = {}) ⇒ Object

#records_split_count ⇒ Object

#required_elements?(record) ⇒ Boolean

#retrieve_cloud_files(files, importer) ⇒ Object

#setup_export_file(folder_count) ⇒ Object

#sort_entries(entries) ⇒ Object

#sort_headers(headers) ⇒ Object

#store_files(identifier, folder_count) ⇒ Object

#total ⇒ Object

#unzip(file_to_unzip) ⇒ Object

#valid_entry_types ⇒ Object

#valid_import? ⇒ Boolean

#works_total ⇒ Object

#write_files ⇒ Object

#write_partial_import_file(file) ⇒ Object

#collections ⇒ `Object`

#file_sets ⇒ `Object`

#validation_mode ⇒ `Object`

#works ⇒ `Object`

.export_supported? ⇒ `Boolean`

#build_records ⇒ `Object`

#collection_entry_class ⇒ `Object`

#collections_total ⇒ `Object`

#create_new_entries ⇒ `Object` Also known as: create_from_collection, create_from_importer, create_from_worktype, create_from_all

#current_records_for_export ⇒ `Object`

#entry_class ⇒ `Object`

#export_headers ⇒ `Object`

#export_key_allowed(key) ⇒ `Object`

#file_paths ⇒ `Object`

#file_set_entry_class ⇒ `Object`

#file_sets_total ⇒ `Object`

#import_fields ⇒ `Object`

#missing_elements(record) ⇒ `Object`

#object_names ⇒ `Object`

#path_to_files(**args) ⇒ `Object`

#records(_opts = {}) ⇒ `Object`

#records_split_count ⇒ `Object`

#required_elements?(record) ⇒ `Boolean`

#retrieve_cloud_files(files, importer) ⇒ `Object`

#setup_export_file(folder_count) ⇒ `Object`

#sort_entries(entries) ⇒ `Object`

#sort_headers(headers) ⇒ `Object`

#store_files(identifier, folder_count) ⇒ `Object`

#total ⇒ `Object`

#unzip(file_to_unzip) ⇒ `Object`

#valid_entry_types ⇒ `Object`

#valid_import? ⇒ `Boolean`

#works_total ⇒ `Object`

#write_files ⇒ `Object`

#write_partial_import_file(file) ⇒ `Object`