Class: Bulkrax::CsvParser
- Inherits:
-
ApplicationParser
- Object
- ApplicationParser
- Bulkrax::CsvParser
- Includes:
- CsvTemplateGeneration, CsvValidation, ErroredEntries, ExportBehavior
- Defined in:
- app/parsers/bulkrax/csv_parser.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb,
app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb
Overview
rubocop:disable Metrics/ClassLength
Direct Known Subclasses
Defined Under Namespace
Modules: CsvTemplateGeneration, CsvValidation, CsvValidationHelpers, CsvValidationHierarchy
Instance Attribute Summary collapse
-
#collections ⇒ Object
rubocop:enable Metrics/AbcSize.
- #file_sets ⇒ Object
-
#validation_mode ⇒ Object
Returns the value of attribute validation_mode.
- #works ⇒ Object
Attributes inherited from ApplicationParser
Class Method Summary collapse
Instance Method Summary collapse
-
#build_records ⇒ Object
rubocop:disable Metrics/AbcSize.
- #collection_entry_class ⇒ Object
- #collections_total ⇒ Object
- #create_new_entries ⇒ Object (also: #create_from_collection, #create_from_importer, #create_from_worktype, #create_from_all)
- #current_records_for_export ⇒ Object
- #entry_class ⇒ Object
-
#export_headers ⇒ Object
All possible column names.
- #export_key_allowed(key) ⇒ Object
-
#file_paths ⇒ Object
Retrieve file paths for [:file] mapping in records and check all listed files exist.
- #file_set_entry_class ⇒ Object
- #file_sets_total ⇒ Object
-
#import_fields ⇒ Object
We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data.
- #missing_elements(record) ⇒ Object
- #object_names ⇒ Object
-
#path_to_files(**args) ⇒ Object
Retrieve the path where we expect to find the files for this import.
- #records(_opts = {}) ⇒ Object
- #records_split_count ⇒ Object
-
#remove_spaces_from_filenames ⇒ Object
File names referenced in CSVs have spaces replaced with underscores.
- #required_elements?(record) ⇒ Boolean
- #retrieve_cloud_files(files, importer) ⇒ Object
-
#setup_export_file(folder_count) ⇒ Object
in the parser as it is specific to the format.
- #sort_entries(entries) ⇒ Object
- #sort_headers(headers) ⇒ Object
- #store_files(identifier, folder_count) ⇒ Object
-
#total ⇒ Object
TODO: figure out why using the version of this method that’s in the bagit parser breaks specs for the “if importer?” line.
-
#unzip_attachments_only(file_to_unzip) ⇒ Object
Extracts a zip that accompanies a separately-uploaded CSV.
-
#unzip_with_primary_csv(file_to_unzip) ⇒ Object
Extracts a zip that contains a primary CSV.
- #valid_entry_types ⇒ Object
- #valid_import? ⇒ Boolean
- #works_total ⇒ Object
-
#write_files ⇒ Object
export methods.
- #write_partial_import_file(file) ⇒ Object
Methods included from ExportBehavior
#build_export_metadata, #build_for_exporter, #file_extension, #filename, #hyrax_record
Methods included from ErroredEntries
#build_errored_entry_row, #setup_errored_entries_file, #write_errored_entries_file
Methods inherited from ApplicationParser
#base_path, #calculate_type_delay, #copy_file, #create_collections, #create_entry_and_job, #create_file_sets, #create_objects, #create_relationships, #create_works, #exporter?, #extract_zip_entry, #find_or_create_entry, #generated_metadata_mapping, #get_field_mapping_hash_for, #import_file_path, import_supported?, #importer?, #initialize, #invalid_record, #limit_reached?, #macos_junk_entry?, #model_field_mappings, #new_entry, parser_fields, #path_for_import, #perform_method, #rebuild_entries, #rebuild_entry_query, #record, #record_deleted?, #record_has_source_identifier, #record_raw_metadata, #record_remove_and_rerun?, #reject_unsafe_entry!, #related_children_parsed_mapping, #related_children_raw_mapping, #related_parents_parsed_mapping, #related_parents_raw_mapping, #required_elements, #safe_extract_path, #source_identifier, #untar, #unzip, #visibility, #work_entry_class, #work_identifier, #work_identifier_search_field, #write, #write_import_file, #zip
Constructor Details
This class inherits a constructor from Bulkrax::ApplicationParser
Instance Attribute Details
#collections ⇒ Object
rubocop:enable Metrics/AbcSize
64 65 66 67 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 64 def collections build_records if @collections.nil? @collections end |
#file_sets ⇒ Object
74 75 76 77 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 74 def file_sets build_records if @file_sets.nil? @file_sets end |
#validation_mode ⇒ Object
Returns the value of attribute validation_mode.
10 11 12 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 10 def validation_mode @validation_mode end |
#works ⇒ Object
69 70 71 72 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 69 def works build_records if @works.nil? @works end |
Class Method Details
.export_supported? ⇒ Boolean
12 13 14 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 12 def self.export_supported? true end |
Instance Method Details
#build_records ⇒ Object
rubocop:disable Metrics/AbcSize
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 31 def build_records @collections = [] @works = [] @file_sets = [] if model_field_mappings.map { |mfm| mfm.to_sym.in?(records.first.keys) }.any? records.map do |r| model_field_mappings.map(&:to_sym).each do |model_mapping| next unless r.key?(model_mapping) model = r[model_mapping].nil? ? "" : r[model_mapping].strip # TODO: Eventually this should be refactored to us Hyrax.config.collection_model # We aren't right now because so many Bulkrax users are in between Fedora and Valkyrie if model.casecmp('collection').zero? || model.casecmp('collectionresource').zero? @collections << r elsif model.casecmp('fileset').zero? || model.casecmp('hyrax::fileset').zero? @file_sets << r else @works << r end end end @collections = @collections.flatten.compact.uniq @file_sets = @file_sets.flatten.compact.uniq @works = @works.flatten.compact.uniq else # if no model is specified, assume all records are works @works = records.flatten.compact.uniq end true end |
#collection_entry_class ⇒ Object
165 166 167 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 165 def collection_entry_class CsvCollectionEntry end |
#collections_total ⇒ Object
79 80 81 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 79 def collections_total collections.size end |
#create_new_entries ⇒ Object Also known as: create_from_collection, create_from_importer, create_from_worktype, create_from_all
143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 143 def create_new_entries # NOTE: The each method enforces the limit, as it can best optimize the underlying queries. current_records_for_export.each do |id, entry_class| new_entry = find_or_create_entry(entry_class, id, 'Bulkrax::Exporter') begin entry = ExportWorkJob.perform_now(new_entry.id, current_run.id) rescue => e Rails.logger.info("#{e.message} was detected during export") end self.headers |= entry.parsed_metadata.keys if entry end end |
#current_records_for_export ⇒ Object
136 137 138 139 140 141 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 136 def current_records_for_export @current_records_for_export ||= Bulkrax::ParserExportRecordSet.for( parser: self, export_from: importerexporter.export_from ) end |
#entry_class ⇒ Object
161 162 163 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 161 def entry_class CsvEntry end |
#export_headers ⇒ Object
All possible column names
289 290 291 292 293 294 295 296 297 298 299 300 301 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 289 def export_headers headers = sort_headers(self.headers) # we don't want access_control_id exported and we want file at the end headers.delete('access_control_id') if headers.include?('access_control_id') # add the headers below at the beginning or end to maintain the preexisting export behavior headers.prepend('model') headers.prepend(source_identifier.to_s) headers.prepend('id') headers.uniq end |
#export_key_allowed(key) ⇒ Object
283 284 285 286 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 283 def export_key_allowed(key) new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) && key != source_identifier.to_s end |
#file_paths ⇒ Object
Retrieve file paths for [:file] mapping in records
and check all listed files exist.
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 351 def file_paths raise StandardError, 'No records were found' if records.blank? return [] if importerexporter.metadata_only? @file_paths ||= records.map do |r| file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file next if r[file_mapping].blank? split_value = Bulkrax.field_mappings.dig(self.class.to_s, :file, :split) split_pattern = case split_value when Regexp split_value when String Regexp.new(split_value) else Bulkrax.multi_value_element_split_on end files_dir = path_to_files raise StandardError, "Record references local files but no files directory could be resolved from the import path" if files_dir.nil? r[file_mapping].split(split_pattern).map do |f| file = File.join(files_dir, f.strip.tr(' ', '_')) if File.exist?(file) # rubocop:disable Style/GuardClause file else raise "File #{file} does not exist" end end end.flatten.compact.uniq end |
#file_set_entry_class ⇒ Object
169 170 171 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 169 def file_set_entry_class CsvFileSetEntry end |
#file_sets_total ⇒ Object
87 88 89 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 87 def file_sets_total file_sets.size end |
#import_fields ⇒ Object
We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
92 93 94 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 92 def import_fields @import_fields ||= records.inject(:merge).keys.compact.uniq end |
#missing_elements(record) ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 100 def missing_elements(record) keys_from_record = keys_without_numbers(record.reject { |_, v| v.blank? }.keys.compact.uniq.map(&:to_s)) keys = [] mapping_values = importerexporter.mapping.stringify_keys mapping_values.each do |k, v| from_values = Array.wrap(v.is_a?(Hash) ? (v['from'] || v[:from]) : nil) from_values.each do |vf| keys << k if vf.present? && keys_from_record.include?(vf.to_s.strip) end end required_elements.map(&:to_s) - keys.uniq.map(&:to_s) end |
#object_names ⇒ Object
303 304 305 306 307 308 309 310 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 303 def object_names return @object_names if @object_names @object_names = mapping.values.map { |value| value['object'] } @object_names.uniq!&.delete(nil) @object_names end |
#path_to_files(**args) ⇒ Object
Retrieve the path where we expect to find the files for this import. After ImporterJob#unzip_imported_file runs (zip cases), attachments live under `importer_unzip_path/files/`. For a server-path-style import (the user specified a CSV file path with a sibling `files/` directory on disk), resolve relative to the CSV's directory instead.
When called with `filename:`, returns the full path to that file if it exists on disk, or `nil` otherwise — callers like `Bulkrax::FileSetEntryBehavior#add_path_to_file` rely on the nil sentinel to fall back to the raw filename in their error messages.
When called with no filename, returns the `files/` directory itself (only when that directory exists on disk — else `nil` so callers can raise a clear "no files directory" error).
396 397 398 399 400 401 402 403 404 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 396 def path_to_files(**args) filename = args.fetch(:filename, '') base_dir = files_dir return base_dir if filename.blank? && Dir.exist?(base_dir) return nil if filename.blank? candidate = File.join(base_dir, filename) candidate if File.exist?(candidate) end |
#records(_opts = {}) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 16 def records(_opts = {}) return @records if @records.present? file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path csv_data = entry_class.read_data(file_for_import) unless validation_mode importer.parser_fields['total'] = csv_data.count importer.save end @records = csv_data.map { |record_data| entry_class.data_for_entry(record_data, nil, self) } @records end |
#records_split_count ⇒ Object
194 195 196 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 194 def records_split_count 1000 end |
#remove_spaces_from_filenames ⇒ Object
File names referenced in CSVs have spaces replaced with underscores.
459 460 461 462 463 464 465 466 467 468 469 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 459 def remove_spaces_from_filenames files = Dir.glob(File.join(importer_unzip_path, 'files', '*')) files_with_spaces = files.select { |f| f.split('/').last.include?(' ') } return if files_with_spaces.blank? files_with_spaces.map! { |path| Pathname.new(path) } files_with_spaces.each do |path| filename_without_spaces = path.basename.to_s.tr(' ', '_') path.rename(File.join(path.dirname, filename_without_spaces)) end end |
#required_elements?(record) ⇒ Boolean
96 97 98 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 96 def required_elements?(record) missing_elements(record).blank? end |
#retrieve_cloud_files(files, importer) ⇒ Object
-
investigate getting directory structure
-
investigate using perform_later, and having the importer check for
DownloadCloudFileJob before it starts
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 201 def retrieve_cloud_files(files, importer) files_path = File.join(path_for_import, 'files') FileUtils.mkdir_p(files_path) unless File.exist?(files_path) target_files = [] files.each_pair do |_key, file| # fixes bug where auth headers do not get attached properly if file['auth_header'].present? file['headers'] ||= {} file['headers'].merge!(file['auth_header']) end # this only works for uniquely named files target_file = File.join(files_path, file['file_name'].tr(' ', '_')) target_files << target_file # Now because we want the files in place before the importer runs # Problematic for a large upload Bulkrax::DownloadCloudFileJob.perform_later(file, target_file) end importer[:parser_fields]['original_file_paths'] = target_files return nil end |
#setup_export_file(folder_count) ⇒ Object
in the parser as it is specific to the format
342 343 344 345 346 347 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 342 def setup_export_file(folder_count) path = File.join(importerexporter.exporter_export_path, folder_count.to_s) FileUtils.mkdir_p(path) unless File.exist?(path) File.join(path, "export_#{importerexporter.export_source}_from_#{importerexporter.export_from}_#{folder_count}.csv") end |
#sort_entries(entries) ⇒ Object
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 312 def sort_entries(entries) # always export models in the same order: work, collection, file set # # TODO: This is a problem in that only these classes are compared. Instead # We should add a comparison operator to the classes. entries.sort_by do |entry| case entry.type when 'Bulkrax::CsvCollectionEntry' '1' when 'Bulkrax::CsvFileSetEntry' '2' else '0' end end end |
#sort_headers(headers) ⇒ Object
329 330 331 332 333 334 335 336 337 338 339 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 329 def sort_headers(headers) # converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order # while keeping objects grouped together headers.sort_by do |item| number = item.match(/\d+/)&.[](0) || 0.to_s sort_number = number.rjust(4, "0") object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '') "#{object_prefix}_#{sort_number}_#{remainder}" end end |
#store_files(identifier, folder_count) ⇒ Object
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 248 def store_files(identifier, folder_count) record = Bulkrax.object_factory.find(identifier) return unless record file_sets = Array.wrap(record) if record.file_set? if file_sets.nil? # for valkyrie file_sets = record.respond_to?(:file_sets) ? record.file_sets : record.members&.select(&:file_set?) end if importerexporter.include_thumbnails? thumbnail = Bulkrax.object_factory.thumbnail_for(resource: record) file_sets << thumbnail if thumbnail.present? end file_sets.each do |fs| path = File.join(exporter_export_path, folder_count, 'files') FileUtils.mkdir_p(path) unless File.exist? path original_file = Bulkrax.object_factory.original_file(fileset: fs) next if original_file.blank? file = filename(fs) io = original_file.respond_to?(:uri) ? open(original_file.uri) : original_file.file.io File.open(File.join(path, file), 'wb') do |f| f.write(io.read) f.close end end rescue Ldp::Gone return rescue StandardError => e raise StandardError, "Unable to retrieve files for identifier #{identifier} - #{e.message}" end |
#total ⇒ Object
TODO: figure out why using the version of this method that’s in the bagit parser breaks specs for the “if importer?” line
179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 179 def total @total = if importer? importer.parser_fields['total'] || 0 elsif exporter? limit.to_i.zero? ? current_records_for_export.count : limit.to_i else 0 end return @total rescue StandardError @total = 0 end |
#unzip_attachments_only(file_to_unzip) ⇒ Object
Extracts a zip that accompanies a separately-uploaded CSV. Every entry lands under #importer_unzip_path/files/ — including any CSVs inside the zip, which are treated as attachments since the primary CSV was uploaded outside the zip. Strips a single top-level wrapper directory if present, so users can zip either the contents or the enclosing folder.
443 444 445 446 447 448 449 450 451 452 453 454 455 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 443 def unzip_attachments_only(file_to_unzip) dest_dir = importer_unzip_path(mkdir: true) Zip::File.open(file_to_unzip) do |zip_file| entries = real_zip_entries(zip_file) wrapper = single_top_level_wrapper(entries) entries.each do |entry| relative = wrapper ? entry.name.delete_prefix("#{wrapper}/") : entry.name next if relative.empty? extract_to(zip_file, entry, dest_dir, File.join('files', relative)) end end end |
#unzip_with_primary_csv(file_to_unzip) ⇒ Object
Extracts a zip that contains a primary CSV. The primary CSV lands at the root of #importer_unzip_path; every other entry lands under #importer_unzip_path/files/, preserving its path relative to the primary CSV’s directory.
Primary-CSV selection matches the guided-import validator’s rule (see ImporterFileHandler#locate_csv_entry_in_zip): the CSV entry at the shallowest directory level. Visible errors are raised on zero CSVs or multiple CSVs at the shallowest level.
418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 418 def unzip_with_primary_csv(file_to_unzip) dest_dir = importer_unzip_path(mkdir: true) Zip::File.open(file_to_unzip) do |zip_file| entries = real_zip_entries(zip_file) primary = select_primary_csv!(entries) primary_dir = File.dirname(primary.name) entries.each do |entry| if entry == primary extract_to(zip_file, entry, dest_dir, File.basename(entry.name)) else extract_to(zip_file, entry, dest_dir, File.join('files', relative_to(primary_dir, entry.name))) end end end end |
#valid_entry_types ⇒ Object
173 174 175 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 173 def valid_entry_types [collection_entry_class.to_s, file_set_entry_class.to_s, entry_class.to_s] end |
#valid_import? ⇒ Boolean
113 114 115 116 117 118 119 120 121 122 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 113 def valid_import? compressed_record = records.flat_map(&:to_a).partition { |_, v| !v }.flatten(1).to_h error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(compressed_record).join(', ')}" raise StandardError, error_alert unless required_elements?(compressed_record) file_paths.is_a?(Array) rescue StandardError => e set_status_info(e) false end |
#works_total ⇒ Object
83 84 85 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 83 def works_total works.size end |
#write_files ⇒ Object
export methods
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 224 def write_files require 'open-uri' folder_count = 0 # TODO: This is not performant as well; unclear how to address, but lower priority as of # <2023-02-21 Tue>. sorted_entries = sort_entries(importerexporter.entries.uniq(&:identifier)) .select { |e| valid_entry_types.include?(e.type) } group_size = limit.to_i.zero? ? total : limit.to_i sorted_entries[0..group_size].in_groups_of(records_split_count, false) do |group| folder_count += 1 CSV.open(setup_export_file(folder_count), "w", headers: export_headers, write_headers: true) do |csv| group.each do |entry| csv << entry.parsed_metadata # TODO: This is precarious when we have descendents of Bulkrax::CsvCollectionEntry next if importerexporter.metadata_only? || entry.type == 'Bulkrax::CsvCollectionEntry' store_files(entry.identifier, folder_count.to_s) end end end end |
#write_partial_import_file(file) ⇒ Object
124 125 126 127 128 129 130 131 132 133 134 |
# File 'app/parsers/bulkrax/csv_parser.rb', line 124 def write_partial_import_file(file) import_filename = import_file_path.split('/').last partial_import_filename = "#{File.basename(import_filename, '.csv')}_corrected_entries.csv" path = File.join(path_for_import, partial_import_filename) FileUtils.mv( file.path, path ) path end |