Class: Bulkrax::BagitParser

Inherits:
CsvParser show all
Includes:
ExportBehavior
Defined in:
app/parsers/bulkrax/bagit_parser.rb

Overview

rubocop:disable Metrics/ClassLength

Instance Attribute Summary

Attributes inherited from CsvParser

#collections, #file_sets, #validation_mode, #works

Attributes inherited from ApplicationParser

#headers, #importerexporter

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ExportBehavior

#build_export_metadata, #build_for_exporter, #file_extension, #filename, #hyrax_record

Methods inherited from CsvParser

#build_records, #collection_entry_class, #collections_total, #create_new_entries, #current_records_for_export, #export_headers, #export_key_allowed, #file_paths, #file_set_entry_class, #file_sets_total, #missing_elements, #object_names, #records_split_count, #remove_spaces_from_filenames, #required_elements?, #setup_export_file, #sort_entries, #sort_headers, #store_files, #total, #valid_entry_types, #works_total, #write_partial_import_file

Methods included from ErroredEntries

#build_errored_entry_row, #setup_errored_entries_file, #write_errored_entries_file

Methods inherited from ApplicationParser

#base_path, #calculate_type_delay, #collection_entry_class, #collections_total, #copy_file, #create_collections, #create_entry_and_job, #create_file_sets, #create_objects, #create_relationships, #create_works, #exporter?, #extract_zip_entry, #file_set_entry_class, #file_sets_total, #find_or_create_entry, #generated_metadata_mapping, #get_field_mapping_hash_for, #import_file_path, import_supported?, #importer?, #initialize, #invalid_record, #limit_reached?, #macos_junk_entry?, #model_field_mappings, #new_entry, parser_fields, #path_for_import, #perform_method, #rebuild_entries, #rebuild_entry_query, #record, #record_deleted?, #record_has_source_identifier, #record_raw_metadata, #record_remove_and_rerun?, #reject_unsafe_entry!, #related_children_parsed_mapping, #related_children_raw_mapping, #related_parents_parsed_mapping, #related_parents_raw_mapping, #required_elements, #safe_extract_path, #setup_export_file, #source_identifier, #total, #untar, #unzip, #visibility, #work_entry_class, #work_identifier, #work_identifier_search_field, #write, #write_import_file, #zip

Constructor Details

This class inherits a constructor from Bulkrax::ApplicationParser

Class Method Details

.export_supported? ⇒ Boolean

Returns:

  • (Boolean)


8
9
10
# File 'app/parsers/bulkrax/bagit_parser.rb', line 8

# Reports whether this parser can be used for exports.
#
# @return [Boolean] always true
def self.export_supported?
  true
end

Instance Method Details

#entry_class ⇒ Object



19
20
21
22
# File 'app/parsers/bulkrax/bagit_parser.rb', line 19

# Picks the entry class that matches the configured metadata format.
#
# @return [Class] RdfEntry when parser_fields['metadata_format'] equals
#   "Bulkrax::RdfEntry"; CsvEntry in every other case, including when
#   parser_fields is nil
def entry_class
  return RdfEntry if parser_fields&.[]('metadata_format') == "Bulkrax::RdfEntry"

  CsvEntry
end

#get_data(bag, data) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'app/parsers/bulkrax/bagit_parser.rb', line 61

# Turns the metadata read from one bag into record data, attaching the bag's
# file list unless the import is metadata-only.
#
# When entry_class is CsvEntry, +data+ is mapped element by element and a
# record only receives a :file value if its :model constantizes to one of
# Bulkrax.curation_concerns. For any other entry class, +data+ is converted
# as a single record.
#
# NOTE(review): `metadata_only?` restores a method name that was missing after
# `importerexporter.` in this listing; confirm against the source file.
#
# @param bag [#bag_files] bag whose file paths are joined with '|' into :file
# @param data [Object] metadata as returned by entry_class.read_data
# @return [Object] an Array of records for CsvEntry, otherwise a single record
def get_data(bag, data)
  if entry_class == CsvEntry
    data = data.map do |data_row|
      record_data = entry_class.data_for_entry(data_row, source_identifier, self)
      next record_data if importerexporter.metadata_only?

      record_data[:file] = bag.bag_files.join('|') if Bulkrax.curation_concerns.include? record_data[:model]&.constantize
      record_data
    end
  else
    data = entry_class.data_for_entry(data, source_identifier, self)
    data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
  end

  data
end

#import_fields ⇒ Object

Take a random sample of 10 metadata_paths and work out the import fields from that

Raises:

  • (StandardError)


41
42
43
44
45
46
# File 'app/parsers/bulkrax/bagit_parser.rb', line 41

# Works out the import fields from a random sample of up to 10 metadata paths.
# The computed list is memoized in @import_fields; the blank check on
# metadata_paths runs on every call.
#
# @return [Array] flattened, nil-free, de-duplicated values returned by
#   entry_class.fields_from_data for each sampled path
# @raise [StandardError] when metadata_paths is blank
def import_fields
  raise StandardError, 'No metadata files were found' if metadata_paths.blank?
  @import_fields ||= metadata_paths.sample(10).map do |path|
    entry_class.fields_from_data(entry_class.read_data(path))
  end.flatten.compact.uniq
end

#key_allowed(key) ⇒ Object



146
147
148
149
150
# File 'app/parsers/bulkrax/bagit_parser.rb', line 146

# Decides whether +key+ may be used as an export key.
#
# A key is rejected when it is one of Bulkrax.reserved_properties, when a new
# exporter entry of entry_class does not support it, or when it equals the
# source identifier (compared as a String).
#
# @param key [String] candidate key
# @return [Boolean] result of the checks above
def key_allowed(key)
  return false if Bulkrax.reserved_properties.include?(key)

  supported = new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key)
  supported && key != source_identifier.to_s
end

#path_to_files(filename:) ⇒ Object



24
25
26
# File 'app/parsers/bulkrax/bagit_parser.rb', line 24

# Finds the first file called +filename+ under any `data` directory below
# import_file_path.
#
# Successful lookups are cached per filename. A single shared memo would hand
# back the first resolved path for every later filename, so the cache is keyed
# by the requested name. Misses (nil) are not cached and are retried.
#
# @param filename [String] file name to look for
# @return [String, nil] the first matching path, or nil when nothing matches
def path_to_files(filename:)
  @path_to_files_by_filename ||= {}
  @path_to_files_by_filename[filename] ||=
    Dir.glob(File.join(import_file_path, '**/data', filename)).first
end

#records(_opts = {}) ⇒ Object

Create an Array of all metadata records

Raises:

  • (StandardError)


49
50
51
52
53
54
55
56
57
58
59
# File 'app/parsers/bulkrax/bagit_parser.rb', line 49

# Builds a flat Array of all metadata records, one bag at a time.
# The result is memoized in @records; the blank check on bags runs on every call.
#
# NOTE(review): `metadata_path(bag)` restores a method name that was missing
# before `(bag)` in this listing; confirm against the source file.
#
# @param _opts [Hash] unused
# @return [Array] flattened results of get_data for every bag
# @raise [StandardError] when bags is blank, or when a bag has no metadata path
def records(_opts = {})
  raise StandardError, 'No BagIt records were found' if bags.blank?

  # Flatten once when the list is built rather than on every call.
  @records ||= bags.map do |bag|
    path = metadata_path(bag)
    raise StandardError, 'No metadata files were found' if path.blank?

    get_data(bag, entry_class.read_data(path))
  end.flatten
end

#retrieve_cloud_files(files, _importer) ⇒ Object

TODO:
  • investigate getting directory structure

TODO:
  • investigate using perform_later, and having the importer check for DownloadCloudFileJob before it starts



181
182
183
184
185
186
187
188
# File 'app/parsers/bulkrax/bagit_parser.rb', line 181

# Downloads the cloud file for a BagIt import into the import directory.
#
# Only the entry under key '0' is used, since a BagIt import is expected to
# consist of a single zip file. The download runs inline (perform_now) so the
# file is in place before the importer starts.
#
# @param files [Hash] cloud file descriptors keyed by index string
# @param _importer [Object] unused
# @return [String, nil] local target path with spaces in the file name replaced
#   by underscores, or nil when files['0'] is blank
def retrieve_cloud_files(files, _importer)
  cloud_file = files['0']
  return if cloud_file.blank?

  target_file = File.join(path_for_import, cloud_file['file_name'].tr(' ', '_'))
  Bulkrax::DownloadCloudFileJob.perform_now(cloud_file, target_file)
  target_file
end

#setup_bagit_folder(folder_count, id) ⇒ Object



159
160
161
162
163
164
# File 'app/parsers/bulkrax/bagit_parser.rb', line 159

# Ensures the numbered export folder exists and returns the path of the bag
# inside it. Only the numbered folder is created here, not the bag path itself.
#
# @param folder_count [#to_s] number of the export sub-folder
# @param id [String] bag identifier, used as the last path segment
# @return [String] <exporter_export_path>/<folder_count>/<id>
def setup_bagit_folder(folder_count, id)
  folder = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(folder) unless File.exist?(folder)
  File.join(folder, id)
end

#setup_csv_metadata_export_file(folder_count, id) ⇒ Object

rubocop:enable Metrics/MethodLength, Metrics/AbcSize



139
140
141
142
143
144
# File 'app/parsers/bulkrax/bagit_parser.rb', line 139

# Ensures the numbered export folder exists and returns the path of the CSV
# metadata file for one bag. Only the numbered folder is created here; the
# <id> directory is not.
#
# @param folder_count [#to_s] number of the export sub-folder
# @param id [String] bag identifier
# @return [String] <exporter_export_path>/<folder_count>/<id>/metadata.csv
def setup_csv_metadata_export_file(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id, 'metadata.csv')
end

#setup_triple_metadata_export_file(folder_count, id) ⇒ Object



152
153
154
155
156
157
# File 'app/parsers/bulkrax/bagit_parser.rb', line 152

# Ensures the numbered export folder exists and returns the path of the
# N-Triples metadata file for one bag. Only the numbered folder is created
# here; the <id> directory is not.
#
# @param folder_count [#to_s] number of the export sub-folder
# @param id [String] bag identifier
# @return [String] <exporter_export_path>/<folder_count>/<id>/metadata.nt
def setup_triple_metadata_export_file(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id, 'metadata.nt')
end

#unzip_attachments_only(file_to_unzip) ⇒ Object



36
37
38
# File 'app/parsers/bulkrax/bagit_parser.rb', line 36

# Extracts the archive by delegating straight to #unzip.
#
# @param file_to_unzip [Object] passed to #unzip unchanged
# @return [Object] whatever #unzip returns
def unzip_attachments_only(file_to_unzip)
  unzip(file_to_unzip)
end

#unzip_with_primary_csv(file_to_unzip) ⇒ Object

BagIt archives are not CSV imports: they don’t contain a primary CSV at the shallowest level, and their structure (bagit.txt + data/ + manifests) must be preserved verbatim. Both CSV-flavored unzip entry points are therefore overridden to use the base-class verbatim extraction.



32
33
34
# File 'app/parsers/bulkrax/bagit_parser.rb', line 32

# Extracts the archive by delegating straight to #unzip, without any
# primary-CSV handling.
#
# @param file_to_unzip [Object] passed to #unzip unchanged
# @return [Object] whatever #unzip returns
def unzip_with_primary_csv(file_to_unzip)
  unzip(file_to_unzip)
end

#valid_import? ⇒ Boolean

Returns:

  • (Boolean)


12
13
14
15
16
17
# File 'app/parsers/bulkrax/bagit_parser.rb', line 12

# Checks whether the import is usable, i.e. whether import_fields yields
# anything.
#
# Any StandardError raised while computing the fields is recorded through
# set_status_info and reported as an invalid import rather than propagated.
#
# @return [Boolean] true when import_fields is present; false when it is
#   empty or an error was raised
def valid_import?
  import_fields.present?
rescue => e
  set_status_info(e)
  false
end

#write_files ⇒ Object

rubocop:disable Metrics/MethodLength, Metrics/AbcSize



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'app/parsers/bulkrax/bagit_parser.rb', line 81

# Exports every work entry as its own BagIt bag.
#
# For each work entry whose record can be found, this:
#   1. collects the work entry plus any collection / file set entries whose
#      metadata contains the record id,
#   2. moves on to the next numbered folder once the running entry count
#      exceeds records_split_count,
#   3. copies each file set's original file into the bag,
#   4. writes the collected entries to the bag's CSV metadata file,
#   5. writes the N-Triples file via write_triples and generates sha256 manifests.
#
# Errors raised while adding a file to the bag are recorded on both the entry
# and the parser via set_status_info; they do not stop the export.
#
# NOTE(review): the three `parsed_metadata` calls restore a method name that
# was missing in this listing; confirm against the source file.
#
# @return [Object] the result of iterating the selected work entries
def write_files
  require 'open-uri'
  require 'socket'

  folder_count = 1
  records_in_folder = 0
  work_entries = importerexporter.entries.where(type: work_entry_class.to_s)
  collection_entries = importerexporter.entries.where(type: collection_entry_class.to_s)
  file_set_entries = importerexporter.entries.where(type: file_set_entry_class.to_s)

  # NOTE(review): `0..n` is inclusive, so this selects up to (limit || total) + 1
  # entries; confirm whether that is intended.
  work_entries[0..limit || total].each do |entry|
    record = Bulkrax.object_factory.find(entry.identifier)
    next unless record

    bag_entries = [entry]

    if record.member_of_collection_ids.present?
      collection_entries.each { |ce| bag_entries << ce if ce.parsed_metadata.value?(record.id) }
    end

    if record.file_sets.present?
      file_set_entries.each { |fse| bag_entries << fse if fse.parsed_metadata.value?(record.id) }
    end

    records_in_folder += bag_entries.count
    if records_in_folder > records_split_count
      folder_count += 1
      records_in_folder = bag_entries.count
    end

    # `bag` is local to this block, so each work gets a fresh bag.
    bag = BagIt::Bag.new setup_bagit_folder(folder_count, entry.identifier)

    record.file_sets.each do |fs|
      file_name = filename(fs)
      next if file_name.blank? || fs.original_file.blank?

      # URI.open rather than Kernel#open: since Ruby 3.0 open-uri no longer
      # makes Kernel#open fetch URLs. The block form closes the handle.
      contents = URI.open(fs.original_file.uri, &:read)
      file = Tempfile.new([file_name, File.extname(file_name)], binmode: true)
      file.write(contents)
      file.close
      begin
        # Skip files the bag already contains.
        bag.add_file(file_name, file.path) if bag.bag_files.none? { |b| b.include?(file_name) }
      rescue => e
        entry.set_status_info(e)
        set_status_info(e)
      end
    end

    CSV.open(setup_csv_metadata_export_file(folder_count, entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
      bag_entries.each { |csv_entry| csv << csv_entry.parsed_metadata }
    end

    write_triples(folder_count, entry)
    bag.manifest!(algo: 'sha256')
  end
end

#write_triples(folder_count, e) ⇒ Object

@todo(bjustice) - remove hyrax reference



167
168
169
170
171
172
173
174
175
176
# File 'app/parsers/bulkrax/bagit_parser.rb', line 167

# Writes the N-Triples serialization of an entry's record into the bag's
# triple metadata file.
#
# @todo(bjustice) - remove hyrax reference
#
# @param folder_count [#to_s] number of the export sub-folder
# @param e [#identifier] entry whose identifier is looked up as a SolrDocument
# @return [Integer, nil] number of bytes written, or nil when no SolrDocument
#   is found
def write_triples(folder_count, e)
  sd = SolrDocument.find(e.identifier)
  return if sd.nil?

  # The graph exporter is given a request whose host is this machine's hostname.
  req = ActionDispatch::Request.new({ 'HTTP_HOST' => Socket.gethostname })
  rdf = Hyrax::GraphExporter.new(sd, req).fetch.dump(:ntriples)
  File.write(setup_triple_metadata_export_file(folder_count, e.identifier), rdf)
end