Class: Bulkrax::BagitParser

Inherits:

CsvParser

Object
ApplicationParser
CsvParser
Bulkrax::BagitParser

show all

Includes:: ExportBehavior

Defined in:: app/parsers/bulkrax/bagit_parser.rb

Overview

rubocop:disable Metrics/ClassLength

Instance Attribute Summary

Attributes inherited from CsvParser

#collections, #file_sets, #validation_mode, #works

Attributes inherited from ApplicationParser

#headers, #importerexporter

Class Method Summary collapse

.export_supported? ⇒ Boolean

Instance Method Summary collapse

#entry_class ⇒ Object
#get_data(bag, data) ⇒ Object
#import_fields ⇒ Object

Take a random sample of 10 metadata_paths and work out the import fields from that.
#key_allowed(key) ⇒ Object
#path_to_files(filename:) ⇒ Object
#records(_opts = {}) ⇒ Object

Create an Array of all metadata records.
#retrieve_cloud_files(files, _importer) ⇒ Object
#setup_bagit_folder(folder_count, id) ⇒ Object
#setup_csv_metadata_export_file(folder_count, id) ⇒ Object

rubocop:enable Metrics/MethodLength, Metrics/AbcSize.
#setup_triple_metadata_export_file(folder_count, id) ⇒ Object
#unzip_attachments_only(file_to_unzip) ⇒ Object
#unzip_with_primary_csv(file_to_unzip) ⇒ Object

BagIt archives are not CSV imports: they don’t contain a primary CSV at the shallowest level, and their structure (bagit.txt + data/ + manifests) must be preserved verbatim.
#valid_import? ⇒ Boolean
#write_files ⇒ Object

rubocop:disable Metrics/MethodLength, Metrics/AbcSize.
#write_triples(folder_count, e) ⇒ Object

@todo(bjustice) - remove hyrax reference.

Constructor Details

This class inherits a constructor from Bulkrax::ApplicationParser

Class Method Details

.export_supported? ⇒ `Boolean`

Returns:

(Boolean)



8
9
10

# File 'app/parsers/bulkrax/bagit_parser.rb', line 8

# Reports whether this parser can be used for exports.
#
# @return [Boolean] always true
def self.export_supported?
  true
end

Instance Method Details

#entry_class ⇒ `Object`

# File 'app/parsers/bulkrax/bagit_parser.rb', line 19

# Picks the entry class used to read and build records for this bag.
#
# @return [Class] RdfEntry when parser_fields['metadata_format'] is exactly
#   "Bulkrax::RdfEntry"; CsvEntry otherwise (including when parser_fields is nil)
def entry_class
  return RdfEntry if parser_fields&.[]('metadata_format') == "Bulkrax::RdfEntry"

  CsvEntry
end

#get_data(bag, data) ⇒ `Object`

# File 'app/parsers/bulkrax/bagit_parser.rb', line 61

# Converts the raw metadata read from one bag into entry data.
#
# When the entry class is CsvEntry, +data+ is mapped row by row and an Array
# is returned; otherwise +data+ is handed to the entry class as a whole and
# a single record is returned.
#
# @param bag [Object] bag whose #bag_files are joined with '|' into :file
# @param data [Object] metadata as returned by entry_class.read_data
# @return [Object] the built record(s)
def get_data(bag, data)
  unless entry_class == CsvEntry
    parsed = entry_class.data_for_entry(data, source_identifier, self)
    parsed[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
    return parsed
  end

  data.map do |row|
    parsed = entry_class.data_for_entry(row, source_identifier, self)
    # Files are attached only for non-metadata-only runs, and only to rows
    # whose model is one of the configured curation concerns.
    attach_files = !importerexporter.metadata_only? &&
                   Bulkrax.curation_concerns.include?(parsed[:model]&.constantize)
    parsed[:file] = bag.bag_files.join('|') if attach_files
    parsed
  end
end

#import_fields ⇒ `Object`

Take a random sample of 10 metadata_paths and work out the import fields from that

Raises:

(StandardError)

# File 'app/parsers/bulkrax/bagit_parser.rb', line 41

# Works out the import fields from a random sample of up to 10 metadata paths.
# The result is memoized in @import_fields; the presence check on
# metadata_paths runs on every call.
#
# @return [Array] flattened, nil-free, de-duplicated field names
# @raise [StandardError] when metadata_paths is blank
def import_fields
  raise StandardError, 'No metadata files were found' if metadata_paths.blank?

  @import_fields ||= begin
    sampled_paths = metadata_paths.sample(10)
    fields_per_file = sampled_paths.map do |path|
      entry_class.fields_from_data(entry_class.read_data(path))
    end
    fields_per_file.flatten.compact.uniq
  end
end

#key_allowed(key) ⇒ `Object`

# File 'app/parsers/bulkrax/bagit_parser.rb', line 146

# Decides whether +key+ passes three checks: it is not in
# Bulkrax.reserved_properties, a new exporter entry reports it as supported,
# and it is not the source identifier.
#
# @param key [String] candidate key
# @return [Boolean, nil] false for reserved keys; otherwise the result of
#   `field_supported?(key) && key != source_identifier.to_s`
def key_allowed(key)
  return false if Bulkrax.reserved_properties.include?(key)

  supported = new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key)
  supported && key != source_identifier.to_s
end

#path_to_files(filename:) ⇒ `Object`



24
25
26

# File 'app/parsers/bulkrax/bagit_parser.rb', line 24

# Finds the first file called +filename+ inside any `data` directory below
# import_file_path.
#
# Lookups are cached per filename. A single shared memo would make every
# later call return the path found for the first filename requested.
# Misses (nil) are not cached, so a file that appears later is still found.
#
# @param filename [String] file name to look for under `**/data`
# @return [String, nil] the first matching path, or nil when nothing matches
def path_to_files(filename:)
  @path_to_files_by_name ||= {}
  @path_to_files_by_name[filename] ||=
    Dir.glob(File.join(import_file_path, '**/data', filename)).first
end

#records(_opts = {}) ⇒ `Object`

Create an Array of all metadata records

Raises:

(StandardError)

# File 'app/parsers/bulkrax/bagit_parser.rb', line 49

# Builds (once) and returns the metadata records of every bag.
#
# @param _opts [Hash] unused
# @return [Array] flattened list of whatever get_data returns per bag
# @raise [StandardError] when bags is blank, or when a bag has no metadata path
def records(_opts = {})
  raise StandardError, 'No BagIt records were found' if bags.blank?

  unless @records
    @records = bags.map do |bag|
      metadata_file = metadata_path(bag)
      raise StandardError, 'No metadata files were found' if metadata_file.blank?

      get_data(bag, entry_class.read_data(metadata_file))
    end
  end

  # get_data may return an Array per bag, so collapse into one flat list.
  @records = @records.flatten
end

#retrieve_cloud_files(files, _importer) ⇒ `Object`

TODO:

investigate getting directory structure

TODO:

investigate using perform_later, and having the importer check for

DownloadCloudFileJob before it starts

# File 'app/parsers/bulkrax/bagit_parser.rb', line 181

# Downloads the cloud-hosted bag archive into the import directory right away.
#
# @param files [Hash] cloud file descriptors keyed by index string; only '0' is read
# @param _importer [Object] unused
# @return [String, nil] the local target path, or nil when files['0'] is blank
def retrieve_cloud_files(files, _importer)
  # There should only be one zip file for Bagit, take the first
  cloud_file = files['0']
  return if cloud_file.blank?

  destination = File.join(path_for_import, cloud_file['file_name'].tr(' ', '_'))
  # Run synchronously because we want the files in place before the importer runs
  Bulkrax::DownloadCloudFileJob.perform_now(cloud_file, destination)
  destination
end

#setup_bagit_folder(folder_count, id) ⇒ `Object`

# File 'app/parsers/bulkrax/bagit_parser.rb', line 159

# Ensures the numbered export folder exists and returns the bag path inside it.
# Only the numbered folder is created here, not the bag directory itself.
#
# @param folder_count [Integer] number of the export sub-folder
# @param id [String] bag directory name
# @return [String] "<export path>/<folder_count>/<id>"
def setup_bagit_folder(folder_count, id)
  export_root = importerexporter.exporter_export_path
  numbered_folder = File.join(export_root, folder_count.to_s)
  FileUtils.mkdir_p(numbered_folder) unless File.exist?(numbered_folder)

  File.join(numbered_folder, id)
end

#setup_csv_metadata_export_file(folder_count, id) ⇒ `Object`

rubocop:enable Metrics/MethodLength, Metrics/AbcSize

# File 'app/parsers/bulkrax/bagit_parser.rb', line 139

# Ensures the numbered export folder exists and returns the CSV metadata path
# for the bag +id+ inside it. The "<id>" directory itself is not created here.
#
# @param folder_count [Integer] number of the export sub-folder
# @param id [String] bag directory name
# @return [String] "<export path>/<folder_count>/<id>/metadata.csv"
def setup_csv_metadata_export_file(folder_count, id)
  export_root = importerexporter.exporter_export_path
  numbered_folder = File.join(export_root, folder_count.to_s)
  FileUtils.mkdir_p(numbered_folder) unless File.exist?(numbered_folder)

  File.join(numbered_folder, id, 'metadata.csv')
end

#setup_triple_metadata_export_file(folder_count, id) ⇒ `Object`

# File 'app/parsers/bulkrax/bagit_parser.rb', line 152

# Ensures the numbered export folder exists and returns the N-Triples metadata
# path for the bag +id+ inside it. The "<id>" directory itself is not created here.
#
# @param folder_count [Integer] number of the export sub-folder
# @param id [String] bag directory name
# @return [String] "<export path>/<folder_count>/<id>/metadata.nt"
def setup_triple_metadata_export_file(folder_count, id)
  export_root = importerexporter.exporter_export_path
  numbered_folder = File.join(export_root, folder_count.to_s)
  FileUtils.mkdir_p(numbered_folder) unless File.exist?(numbered_folder)

  File.join(numbered_folder, id, 'metadata.nt')
end

#unzip_attachments_only(file_to_unzip) ⇒ `Object`



36
37
38

# File 'app/parsers/bulkrax/bagit_parser.rb', line 36

# Extracts +file_to_unzip+ by delegating straight to #unzip.
# NOTE(review): #unzip is defined outside this listing; presumably it is the
# inherited plain extraction, so the archive layout is kept as-is. Confirm.
#
# @param file_to_unzip [Object] passed through to #unzip unchanged
# @return [Object] whatever #unzip returns
def unzip_attachments_only(file_to_unzip)
  unzip(file_to_unzip)
end

#unzip_with_primary_csv(file_to_unzip) ⇒ `Object`

BagIt archives are not CSV imports: they don’t contain a primary CSV at the shallowest level, and their structure (bagit.txt + data/ + manifests) must be preserved verbatim. Override both CSV-flavored unzip entry points to use the base-class verbatim extraction.



32
33
34

# File 'app/parsers/bulkrax/bagit_parser.rb', line 32

# Extracts +file_to_unzip+ by delegating straight to #unzip, with no search
# for a primary CSV.
# NOTE(review): #unzip is defined outside this listing; presumably it is the
# inherited plain extraction, so the archive layout is kept as-is. Confirm.
#
# @param file_to_unzip [Object] passed through to #unzip unchanged
# @return [Object] whatever #unzip returns
def unzip_with_primary_csv(file_to_unzip)
  unzip(file_to_unzip)
end

#valid_import? ⇒ `Boolean`

Returns:

(Boolean)

# File 'app/parsers/bulkrax/bagit_parser.rb', line 12

# Checks whether any import fields could be derived from the bag metadata.
#
# Always returns a real boolean; a `return true if ...` form would yield nil
# when import_fields is empty. Any StandardError raised while computing the
# fields is recorded via set_status_info and reported as an invalid import.
#
# @return [Boolean] true when import_fields is present, false otherwise
def valid_import?
  import_fields.present?
rescue => e
  set_status_info(e)
  false
end

#write_files ⇒ `Object`

rubocop:disable Metrics/MethodLength, Metrics/AbcSize

# File 'app/parsers/bulkrax/bagit_parser.rb', line 81

# Exports every work entry as its own BagIt bag.
#
# For each work entry whose record can be found, this:
# * gathers the collection and file set entries whose parsed_metadata
#   contains the record id,
# * moves on to the next numbered folder once records_split_count is exceeded,
# * downloads each file set's original file and adds it to the bag,
# * writes metadata.csv and the N-Triples file, then writes the bag
#   manifest with sha256.
#
# Errors raised while adding a file to the bag are recorded on both the entry
# and the parser instead of aborting the export.
def write_files
  # Lazy requires: open-uri for URI.open below, socket for write_triples.
  require 'open-uri'
  require 'socket'

  folder_count = 1
  records_in_folder = 0
  work_entries = importerexporter.entries.where(type: work_entry_class.to_s)
  collection_entries = importerexporter.entries.where(type: collection_entry_class.to_s)
  file_set_entries = importerexporter.entries.where(type: file_set_entry_class.to_s)

  # NOTE(review): the range is inclusive, so this takes up to limit + 1
  # entries. Confirm that is intended.
  work_entries[0..limit || total].each do |entry|
    record = Bulkrax.object_factory.find(entry.identifier)
    next unless record

    bag_entries = [entry]

    if record.member_of_collection_ids.present?
      collection_entries.each { |ce| bag_entries << ce if ce.parsed_metadata.value?(record.id) }
    end

    if record.file_sets.present?
      file_set_entries.each { |fse| bag_entries << fse if fse.parsed_metadata.value?(record.id) }
    end

    records_in_folder += bag_entries.count
    if records_in_folder > records_split_count
      folder_count += 1
      records_in_folder = bag_entries.count
    end

    bag = BagIt::Bag.new setup_bagit_folder(folder_count, entry.identifier)

    record.file_sets.each do |fs|
      file_name = filename(fs)
      next if file_name.blank? || fs.original_file.blank?

      # URI.open replaces bare Kernel#open, which stopped handling URIs in
      # Ruby 3.0; the block form closes the download stream when done.
      contents = URI.open(fs.original_file.uri, &:read)
      file = Tempfile.new([file_name, File.extname(file_name)], binmode: true)
      file.write(contents)
      file.close
      begin
        # Skip files whose name already appears in one of the bag's file paths.
        bag.add_file(file_name, file.path) if bag.bag_files.none? { |b| b.include?(file_name) }
      rescue => e
        entry.set_status_info(e)
        set_status_info(e)
      end
    end

    CSV.open(setup_csv_metadata_export_file(folder_count, entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
      bag_entries.each { |csv_entry| csv << csv_entry.parsed_metadata }
    end

    write_triples(folder_count, entry)
    bag.manifest!(algo: 'sha256')
  end
end

#write_triples(folder_count, e) ⇒ `Object`

@todo(bjustice) - remove hyrax reference

# File 'app/parsers/bulkrax/bagit_parser.rb', line 167

# Writes the N-Triples serialization of an entry's graph to its metadata file.
#
# @todo(bjustice) - remove hyrax reference
# @param folder_count [Integer] number of the export sub-folder
# @param e [Object] entry; only #identifier is read
# @return [Integer, nil] File.write's byte count, or nil when
#   SolrDocument.find returns nil
def write_triples(folder_count, e)
  solr_doc = SolrDocument.find(e.identifier)
  return if solr_doc.nil?

  request = ActionDispatch::Request.new({ 'HTTP_HOST' => Socket.gethostname })
  ntriples = Hyrax::GraphExporter.new(solr_doc, request).fetch.dump(:ntriples)
  File.write(setup_triple_metadata_export_file(folder_count, e.identifier), ntriples)
end

Class: Bulkrax::BagitParser

Overview

Instance Attribute Summary

Attributes inherited from CsvParser

Attributes inherited from ApplicationParser

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ExportBehavior

Methods inherited from CsvParser

Methods included from ErroredEntries

Methods inherited from ApplicationParser

Constructor Details

Class Method Details

.export_supported? ⇒ Boolean

Instance Method Details

#entry_class ⇒ Object

#get_data(bag, data) ⇒ Object

#import_fields ⇒ Object

#key_allowed(key) ⇒ Object

#path_to_files(filename:) ⇒ Object

#records(_opts = {}) ⇒ Object

#retrieve_cloud_files(files, _importer) ⇒ Object

#setup_bagit_folder(folder_count, id) ⇒ Object

#setup_csv_metadata_export_file(folder_count, id) ⇒ Object

#setup_triple_metadata_export_file(folder_count, id) ⇒ Object

#unzip_attachments_only(file_to_unzip) ⇒ Object

#unzip_with_primary_csv(file_to_unzip) ⇒ Object

#valid_import? ⇒ Boolean

#write_files ⇒ Object

#write_triples(folder_count, e) ⇒ Object