Class: Bulkrax::BagitParser

Inherits:
CsvParser
Includes:
ExportBehavior
Defined in:
app/parsers/bulkrax/bagit_parser.rb

Overview

rubocop:disable Metrics/ClassLength

Instance Attribute Summary

Attributes inherited from CsvParser

#collections, #file_sets, #works

Attributes inherited from ApplicationParser

#headers, #importerexporter

Class Method Summary

Instance Method Summary

Methods included from ExportBehavior

#build_export_metadata, #build_for_exporter, #filename, #hyrax_record

Methods inherited from CsvParser

#build_records, #collection_entry_class, #collections_total, #create_new_entries, #current_records_for_export, #export_headers, #export_key_allowed, #file_paths, #file_set_entry_class, #file_sets_total, #missing_elements, #object_names, #records_split_count, #required_elements?, #setup_export_file, #sort_entries, #sort_headers, #store_files, #total, #valid_entry_types, #works_total, #write_partial_import_file

Methods included from ErroredEntries

#build_errored_entry_row, #setup_errored_entries_file, #write_errored_entries_file

Methods inherited from ApplicationParser

#base_path, #calculate_type_delay, #collection_entry_class, #collections_total, #create_collections, #create_entry_and_job, #create_file_sets, #create_objects, #create_relationships, #create_works, #exporter?, #file_set_entry_class, #file_sets_total, #find_or_create_entry, #generated_metadata_mapping, #get_field_mapping_hash_for, #import_file_path, import_supported?, #importer?, #initialize, #invalid_record, #limit_reached?, #model_field_mappings, #new_entry, parser_fields, #path_for_import, #perform_method, #rebuild_entries, #rebuild_entry_query, #record, #record_deleted?, #record_has_source_identifier, #record_raw_metadata, #record_remove_and_rerun?, #related_children_parsed_mapping, #related_children_raw_mapping, #related_parents_parsed_mapping, #related_parents_raw_mapping, #required_elements, #setup_export_file, #source_identifier, #total, #untar, #unzip, #visibility, #work_entry_class, #work_identifier, #work_identifier_search_field, #write, #write_import_file, #zip

Constructor Details

This class inherits a constructor from Bulkrax::ApplicationParser

Class Method Details

.export_supported? ⇒ Boolean

Returns:

  • (Boolean)


# File 'app/parsers/bulkrax/bagit_parser.rb', line 8

def self.export_supported?
  true
end

Instance Method Details

#entry_class ⇒ Object



# File 'app/parsers/bulkrax/bagit_parser.rb', line 19

def entry_class
  rdf_format = parser_fields&.[]('metadata_format') == "Bulkrax::RdfEntry"
  rdf_format ? RdfEntry : CsvEntry
end
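
A hedged usage sketch (the importer below is hypothetical; parser_fields is normally read from the importer or exporter record):

parser = Bulkrax::BagitParser.new(importer)   # illustrative only
parser.entry_class
# => Bulkrax::RdfEntry when parser_fields['metadata_format'] == "Bulkrax::RdfEntry"
# => Bulkrax::CsvEntry otherwise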

#get_data(bag, data) ⇒ Object



# File 'app/parsers/bulkrax/bagit_parser.rb', line 49

def get_data(bag, data)
  if entry_class == CsvEntry
    data = data.map do |data_row|
      record_data = entry_class.data_for_entry(data_row, source_identifier, self)
      next record_data if importerexporter.metadata_only?

      record_data[:file] = bag.bag_files.join('|') if Bulkrax.curation_concerns.include? record_data[:model]&.constantize
      record_data
    end
  else
    data = entry_class.data_for_entry(data, source_identifier, self)
    data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
  end

  data
end
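
A hedged sketch of the return value (field names, identifiers, and paths below are illustrative, not taken from the source):

bag = BagIt::Bag.new('/imports/bag_1')                                   # hypothetical bag on disk
data = parser.entry_class.read_data('/imports/bag_1/data/metadata.csv')
parser.get_data(bag, data)
# => [{ model: 'Work', title: 'My Work', source_identifier: 'work_1',
#       file: '/imports/bag_1/data/image.tiff|/imports/bag_1/data/report.pdf' }]
# The :file key joins the bag's payload paths with '|' and is only added when the
# row's model is one of the configured Bulkrax.curation_concerns.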

#import_fields ⇒ Object

Take a random sample of 10 metadata_paths and work out the import fields from that

Raises:

  • (StandardError)


# File 'app/parsers/bulkrax/bagit_parser.rb', line 29

def import_fields
  raise StandardError, 'No metadata files were found' if metadata_paths.blank?
  @import_fields ||= metadata_paths.sample(10).map do |path|
    entry_class.fields_from_data(entry_class.read_data(path))
  end.flatten.compact.uniq
end
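
A hedged sketch of the result; the actual fields depend entirely on the bags' metadata files:

# Assuming the sampled metadata files are CSVs with title and creator columns:
parser.import_fields
# => [:source_identifier, :title, :creator, ...]   (flattened, nils removed, de-duplicated)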

#key_allowed(key) ⇒ Object



# File 'app/parsers/bulkrax/bagit_parser.rb', line 134

def key_allowed(key)
  !Bulkrax.reserved_properties.include?(key) &&
    new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
    key != source_identifier.to_s
end
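
Illustrative calls; whether a given key passes depends on the configured field mapping and Bulkrax.reserved_properties:

parser.key_allowed('title')                         # => true when 'title' is mapped and supported for export
parser.key_allowed('id')                            # => false when 'id' is listed in Bulkrax.reserved_properties
parser.key_allowed(parser.source_identifier.to_s)   # => false, the source identifier column is always excluded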

#path_to_files(filename:) ⇒ Object



# File 'app/parsers/bulkrax/bagit_parser.rb', line 24

def path_to_files(filename:)
  @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
end
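
For example, assuming a hypothetical bag laid out under import_file_path:

# Given <import_file_path>/bag_1/data/image.tiff on disk:
parser.path_to_files(filename: 'image.tiff')
# => "<import_file_path>/bag_1/data/image.tiff"   (first glob match; nil when nothing matches)

Note that the result is memoized in @path_to_files, so the first filename resolved is the one cached for the life of the parser instance.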

#records(_opts = {}) ⇒ Object

Create an Array of all metadata records

Raises:

  • (StandardError)


# File 'app/parsers/bulkrax/bagit_parser.rb', line 37

def records(_opts = {})
  raise StandardError, 'No BagIt records were found' if bags.blank?
  @records ||= bags.map do |bag|
    path = metadata_path(bag)
    raise StandardError, 'No metadata files were found' if path.blank?
    data = entry_class.read_data(path)
    get_data(bag, data)
  end

  @records = @records.flatten
end

#retrieve_cloud_files(files, _importer) ⇒ Object

TODO:
  • investigate getting directory structure
  • investigate using perform_later, and having the importer check for DownloadCloudFileJob before it starts



# File 'app/parsers/bulkrax/bagit_parser.rb', line 169

def retrieve_cloud_files(files, _importer)
  # There should only be one zip file for Bagit, take the first
  return if files['0'].blank?
  target_file = File.join(path_for_import, files['0']['file_name'].tr(' ', '_'))
  # Now because we want the files in place before the importer runs
  Bulkrax::DownloadCloudFileJob.perform_now(files['0'], target_file)
  return target_file
end
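
A hedged sketch of the expected argument shape; the exact keys in the files hash come from the cloud-file selection step, so treat them as assumptions:

files = { '0' => { 'file_name' => 'my bags.zip' } }   # plus whatever keys DownloadCloudFileJob expects
parser.retrieve_cloud_files(files, importer)
# => "<path_for_import>/my_bags.zip"
# Spaces in the name become underscores, and the download runs synchronously
# (perform_now) so the zip is on disk before the import begins.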

#setup_bagit_folder(folder_count, id) ⇒ Object



# File 'app/parsers/bulkrax/bagit_parser.rb', line 147

def setup_bagit_folder(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id)
end
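
For illustration (folder count and identifier are hypothetical):

parser.setup_bagit_folder(2, 'work-123')
# => "<exporter_export_path>/2/work-123"
# "<exporter_export_path>/2" is created first if it does not already exist.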

#setup_csv_metadata_export_file(folder_count, id) ⇒ Object

rubocop:enable Metrics/MethodLength, Metrics/AbcSize



# File 'app/parsers/bulkrax/bagit_parser.rb', line 127

def setup_csv_metadata_export_file(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id, 'metadata.csv')
end
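
For illustration, using the same hypothetical identifiers as above:

parser.setup_csv_metadata_export_file(1, 'work-123')
# => "<exporter_export_path>/1/work-123/metadata.csv"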

#setup_triple_metadata_export_file(folder_count, id) ⇒ Object



# File 'app/parsers/bulkrax/bagit_parser.rb', line 140

def setup_triple_metadata_export_file(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id, 'metadata.nt')
end
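
And the matching N-Triples path, under the same assumptions:

parser.setup_triple_metadata_export_file(1, 'work-123')
# => "<exporter_export_path>/1/work-123/metadata.nt"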

#valid_import? ⇒ Boolean

Returns:

  • (Boolean)


# File 'app/parsers/bulkrax/bagit_parser.rb', line 12

def valid_import?
  return true if import_fields.present?
rescue => e
  set_status_info(e)
  false
end
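
In use, a minimal sketch:

parser.valid_import?
# => true  when at least one metadata file can be read and yields import fields
# => false (the rescue clause) when reading raises; the error is recorded via set_status_info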

#write_files ⇒ Object

rubocop:disable Metrics/MethodLength, Metrics/AbcSize



# File 'app/parsers/bulkrax/bagit_parser.rb', line 69

def write_files
  require 'open-uri'
  require 'socket'

  folder_count = 1
  records_in_folder = 0
  work_entries = importerexporter.entries.where(type: work_entry_class.to_s)
  collection_entries = importerexporter.entries.where(type: collection_entry_class.to_s)
  file_set_entries = importerexporter.entries.where(type: file_set_entry_class.to_s)

  work_entries[0..limit || total].each do |entry|
    record = Bulkrax.object_factory.find(entry.identifier)
    next unless record

    bag_entries = [entry]

    if record.member_of_collection_ids.present?
      collection_entries.each { |ce| bag_entries << ce if ce.parsed_metadata.value?(record.id) }
    end

    if record.file_sets.present?
      file_set_entries.each { |fse| bag_entries << fse if fse.parsed_metadata.value?(record.id) }
    end

    records_in_folder += bag_entries.count
    if records_in_folder > records_split_count
      folder_count += 1
      records_in_folder = bag_entries.count
    end

    bag ||= BagIt::Bag.new setup_bagit_folder(folder_count, entry.identifier)

    record.file_sets.each do |fs|
      file_name = filename(fs)
      next if file_name.blank? || fs.original_file.blank?

      io = open(fs.original_file.uri)
      file = Tempfile.new([file_name, File.extname(file_name)], binmode: true)
      file.write(io.read)
      file.close
      begin
        bag.add_file(file_name, file.path) if bag.bag_files.select { |b| b.include?(file_name) }.blank?
      rescue => e
        entry.set_status_info(e)
        set_status_info(e)
      end
    end

    CSV.open(setup_csv_metadata_export_file(folder_count, entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
      bag_entries.each { |csv_entry| csv << csv_entry.parsed_metadata }
    end

    write_triples(folder_count, entry)
    bag.manifest!(algo: 'sha256')
  end
end
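
The resulting export layout, sketched under the assumption of a single work entry and hypothetical identifiers:

# <exporter_export_path>/
#   1/                    <- folder_count; incremented once records_split_count is exceeded
#     work-123/           <- one BagIt bag per work entry
#       bagit.txt
#       manifest-sha256.txt
#       data/             <- payload files copied from the work's file sets
#       metadata.csv      <- the work plus related collection and file set entries (export_headers)
#       metadata.nt       <- N-Triples graph written by write_triples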

#write_triples(folder_count, e) ⇒ Object

@todo(bjustice) - remove hyrax reference



# File 'app/parsers/bulkrax/bagit_parser.rb', line 155

def write_triples(folder_count, e)
  sd = SolrDocument.find(e.identifier)
  return if sd.nil?

  req = ActionDispatch::Request.new({ 'HTTP_HOST' => Socket.gethostname })
  rdf = Hyrax::GraphExporter.new(sd, req).fetch.dump(:ntriples)
  File.open(setup_triple_metadata_export_file(folder_count, e.identifier), "w") do |triples|
    triples.write(rdf)
  end
end