Class: Bulkrax::BagitParser

Inherits:
CsvParser show all
Includes:
ExportBehavior
Defined in:
app/parsers/bulkrax/bagit_parser.rb

Overview

rubocop:disable Metrics/ClassLength

Instance Attribute Summary

Attributes inherited from CsvParser

#collections, #file_sets, #works

Attributes inherited from ApplicationParser

#headers, #importerexporter

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ExportBehavior

#build_export_metadata, #build_for_exporter, #filename, #hyrax_record

Methods inherited from CsvParser

#build_records, #collection_entry_class, #collections_total, #create_collections, #create_entry_and_job, #create_file_sets, #create_new_entries, #create_objects, #create_relationships, #current_records_for_export, #export_headers, #export_key_allowed, #file_paths, #file_set_entry_class, #file_sets_total, #missing_elements, #object_names, #records_split_count, #required_elements?, #setup_export_file, #sort_entries, #sort_headers, #store_files, #total, #valid_entry_types, #works_total, #write_partial_import_file

Methods included from ErroredEntries

#build_errored_entry_row, #setup_errored_entries_file, #write_errored_entries_file

Methods inherited from ApplicationParser

#base_path, #collection_entry_class, #collections_total, #create_collections, #create_file_sets, #create_objects, #create_relationships, #exporter?, #file_set_entry_class, #file_sets_total, #find_or_create_entry, #generated_metadata_mapping, #get_field_mapping_hash_for, #import_file_path, import_supported?, #importer?, #initialize, #invalid_record, #limit_reached?, #model_field_mappings, #new_entry, parser_fields, #path_for_import, #perform_method, #record, #record_has_source_identifier, #related_children_parsed_mapping, #related_children_raw_mapping, #related_parents_parsed_mapping, #related_parents_raw_mapping, #required_elements, #setup_export_file, #source_identifier, #total, #unzip, #visibility, #work_identifier, #write, #write_import_file, #zip

Constructor Details

This class inherits a constructor from Bulkrax::ApplicationParser

Class Method Details

.export_supported? ⇒ Boolean

Returns:

  • (Boolean)


8
9
10
# File 'app/parsers/bulkrax/bagit_parser.rb', line 8

# Indicates whether this parser supports exporting.
#
# @return [Boolean] always true for this parser
def self.export_supported?
  true
end

Instance Method Details

#create_rdf_works ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'app/parsers/bulkrax/bagit_parser.rb', line 70

# Walks every record, finds or creates its importer entry and dispatches either
# a delete job or an import job for it, then records the importer status.
#
# Records without a source identifier are skipped; iteration stops as soon as
# limit_reached? reports true. Any StandardError raised along the way is handed
# to set_status_info instead of propagating.
#
# @return [Object] the result of importer.record_status, or of set_status_info
#   when an error was rescued
def create_rdf_works
  records.each_with_index do |rdf_record, position|
    next unless record_has_source_identifier(rdf_record, position)
    break if limit_reached?(limit, position)

    identifier = rdf_record[source_identifier]
    seen[identifier] = true
    entry = find_or_create_entry(entry_class, identifier, 'Bulkrax::Importer', rdf_record)

    # The delete job receives the objects themselves, the import job only their ids.
    job_class, job_args =
      if rdf_record[:delete].present?
        [DeleteWorkJob, [entry, current_run]]
      else
        [ImportWorkJob, [entry.id, current_run.id]]
      end
    job_class.send(perform_method, *job_args)

    increment_counters(position, work: true)
  end
  importer.record_status
rescue StandardError => error
  set_status_info(error)
end

#create_works ⇒ Object



66
67
68
# File 'app/parsers/bulkrax/bagit_parser.rb', line 66

# Creates works using the strategy that matches the configured entry class:
# the inherited implementation for CsvEntry, create_rdf_works otherwise.
def create_works
  return create_rdf_works unless entry_class == CsvEntry

  super
end

#entry_class ⇒ Object



19
20
21
22
# File 'app/parsers/bulkrax/bagit_parser.rb', line 19

# Picks the entry class from the 'metadata_format' parser field.
#
# @return [Class] RdfEntry when the field equals "Bulkrax::RdfEntry",
#   CsvEntry in every other case (including when parser_fields is nil)
def entry_class
  return RdfEntry if parser_fields&.[]('metadata_format') == "Bulkrax::RdfEntry"

  CsvEntry
end

#get_data(bag, data) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'app/parsers/bulkrax/bagit_parser.rb', line 49

# Converts the metadata read from a bag into entry data and, unless the
# importer/exporter is metadata-only, attaches the bag's files to it.
#
# For CsvEntry the data is a collection of rows: each row is converted
# separately and the file list is only attached to rows whose :model is one of
# Bulkrax.curation_concerns. For any other entry class the data is converted
# as a single record.
#
# @param bag [#bag_files] the bag the metadata was read from
# @param data [Object] parsed metadata (rows for CSV, a single document otherwise)
# @return [Object] the converted entry data (an Array of rows for CSV)
def get_data(bag, data)
  unless entry_class == CsvEntry
    record_data = entry_class.data_for_entry(data, source_identifier, self)
    record_data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
    return record_data
  end

  data.map do |data_row|
    record_data = entry_class.data_for_entry(data_row, source_identifier, self)
    next record_data if importerexporter.metadata_only?

    # Files are joined with '|' and only attached to rows describing a curation concern.
    record_data[:file] = bag.bag_files.join('|') if Bulkrax.curation_concerns.include? record_data[:model]&.constantize
    record_data
  end
end

#import_fields ⇒ Object

Take a random sample of 10 metadata_paths and work out the import fields from that

Raises:

  • (StandardError)


29
30
31
32
33
34
# File 'app/parsers/bulkrax/bagit_parser.rb', line 29

# Take a random sample of 10 metadata_paths and work out the import fields from that.
#
# The result is memoized in @import_fields; the blank check on metadata_paths
# still runs on every call.
#
# @return [Array] the unique, non-nil fields found in the sampled metadata files
# @raise [StandardError] when metadata_paths is blank
def import_fields
  raise StandardError, 'No metadata files were found' if metadata_paths.blank?

  @import_fields ||= metadata_paths.sample(10).map do |path|
    entry_class.fields_from_data(entry_class.read_data(path))
  end.flatten.compact.uniq
end

#key_allowed(key) ⇒ Object



157
158
159
160
161
# File 'app/parsers/bulkrax/bagit_parser.rb', line 157

# Decides whether a key may be used during export.
#
# A key is allowed when it is not one of Bulkrax.reserved_properties, a new
# exporter entry of the current entry class reports it as a supported field,
# and it differs from the source identifier.
#
# @param key [String] the key to check
# @return [Boolean] truthy when the key is allowed
def key_allowed(key)
  return false if Bulkrax.reserved_properties.include?(key)

  supported = new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key)
  supported && key != source_identifier.to_s
end

#path_to_files(filename:) ⇒ Object



24
25
26
# File 'app/parsers/bulkrax/bagit_parser.rb', line 24

# Finds the first file named +filename+ inside any `data` directory below
# import_file_path.
#
# Lookups are cached per filename. The previous single-value memo returned the
# path of whichever file was requested first, regardless of the +filename+
# passed on later calls. As before, a miss (nil) is not cached and is retried.
#
# @param filename [String] name (or glob pattern) of the file to locate
# @return [String, nil] the first matching path, or nil when nothing matches
def path_to_files(filename:)
  @paths_to_files ||= {}
  @paths_to_files[filename] ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
end

#records(_opts = {}) ⇒ Object

Create an Array of all metadata records

Raises:

  • (StandardError)


37
38
39
40
41
42
43
44
45
46
47
# File 'app/parsers/bulkrax/bagit_parser.rb', line 37

# Create an Array of all metadata records.
#
# Each bag's metadata file is read with the current entry class and converted
# by get_data; the per-bag results are flattened into one list, which is
# memoized in @records.
#
# @param _opts [Hash] unused
# @return [Array] one flat list with the records of every bag
# @raise [StandardError] when there are no bags, or a bag has no metadata file
def records(_opts = {})
  raise StandardError, 'No BagIt records were found' if bags.blank?

  @records ||= bags.map do |bag|
    path = metadata_path(bag)
    raise StandardError, 'No metadata files were found' if path.blank?

    get_data(bag, entry_class.read_data(path))
  end.flatten
end

#retrieve_cloud_files(files) ⇒ Object



192
193
194
195
196
197
198
199
# File 'app/parsers/bulkrax/bagit_parser.rb', line 192

# Downloads the cloud file stored under key '0' into the import path.
#
# There should only be one zip file for Bagit, so only the first entry is used.
# The download runs synchronously (perform_now) because the files must be in
# place before the importer runs.
#
# @param files [Hash] cloud file descriptions keyed by index string
# @return [String, nil] the local target path (spaces in the file name become
#   underscores), or nil when files['0'] is blank
def retrieve_cloud_files(files)
  cloud_file = files['0']
  return if cloud_file.blank?

  local_name = cloud_file['file_name'].tr(' ', '_')
  File.join(path_for_import, local_name).tap do |target_file|
    Bulkrax::DownloadCloudFileJob.perform_now(cloud_file, target_file)
  end
end

#setup_bagit_folder(folder_count, id) ⇒ Object



170
171
172
173
174
175
# File 'app/parsers/bulkrax/bagit_parser.rb', line 170

# Ensures the numbered export folder exists and returns the bag path inside it.
#
# Only the numbered folder is created here; the returned path itself is not.
#
# @param folder_count [Integer] number of the export folder
# @param id [String] identifier used as the bag's directory name
# @return [String] <exporter_export_path>/<folder_count>/<id>
def setup_bagit_folder(folder_count, id)
  export_folder = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  File.exist?(export_folder) || FileUtils.mkdir_p(export_folder)

  File.join(export_folder, id)
end

#setup_csv_metadata_export_file(folder_count, id) ⇒ Object

rubocop:enable Metrics/MethodLength, Metrics/AbcSize



150
151
152
153
154
155
# File 'app/parsers/bulkrax/bagit_parser.rb', line 150

# Ensures the numbered export folder exists and returns the path of the CSV
# metadata file inside the bag directory for +id+.
#
# Only the numbered folder is created here; the bag directory (<id>) is not.
#
# @param folder_count [Integer] number of the export folder
# @param id [String] identifier used as the bag's directory name
# @return [String] <exporter_export_path>/<folder_count>/<id>/metadata.csv
def setup_csv_metadata_export_file(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id, 'metadata.csv')
end

#setup_triple_metadata_export_file(folder_count, id) ⇒ Object



163
164
165
166
167
168
# File 'app/parsers/bulkrax/bagit_parser.rb', line 163

# Ensures the numbered export folder exists and returns the path of the
# N-Triples metadata file inside the bag directory for +id+.
#
# Only the numbered folder is created here; the bag directory (<id>) is not.
#
# @param folder_count [Integer] number of the export folder
# @param id [String] identifier used as the bag's directory name
# @return [String] <exporter_export_path>/<folder_count>/<id>/metadata.nt
def setup_triple_metadata_export_file(folder_count, id)
  path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
  FileUtils.mkdir_p(path) unless File.exist?(path)

  File.join(path, id, 'metadata.nt')
end

#valid_import? ⇒ Boolean

Returns:

  • (Boolean)


12
13
14
15
16
17
# File 'app/parsers/bulkrax/bagit_parser.rb', line 12

# Checks whether any import fields can be derived for this import.
#
# Always answers with a strict boolean: previously an empty import_fields
# result made this predicate return nil instead of false.
#
# @return [Boolean] true when import_fields is present; false when it is empty
#   or when computing it raised (the error is passed to set_status_info)
def valid_import?
  import_fields.present?
rescue StandardError => e
  set_status_info(e)
  false
end

#write_files ⇒ Object

rubocop:disable Metrics/MethodLength, Metrics/AbcSize



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'app/parsers/bulkrax/bagit_parser.rb', line 92

# Exports each work entry as a BagIt bag.
#
# For every work entry the bag receives the original files of the work's file
# sets, a CSV metadata file with one row per related entry (the work itself
# plus collection and file set entries whose parsed metadata contains the
# work's id) and an N-Triples file written by write_triples. Bags are spread
# over numbered folders: a new folder is started once the running entry count
# would exceed records_split_count.
#
# Errors raised while adding a file to the bag are recorded on both the entry
# and the parser; any other error propagates to the caller.
def write_files
  require 'open-uri'
  require 'socket'

  folder_count = 1
  records_in_folder = 0
  work_entries = importerexporter.entries.where(type: work_entry_class.to_s)
  collection_entries = importerexporter.entries.where(type: collection_entry_class.to_s)
  file_set_entries = importerexporter.entries.where(type: file_set_entry_class.to_s)

  # NOTE(review): `0..limit` is an inclusive range, so a limit of N selects
  # N + 1 entries — confirm whether that is intended before changing it.
  work_entries[0..limit || total].each do |entry|
    record = ActiveFedora::Base.find(entry.identifier)
    next unless record

    bag_entries = [entry]

    if record.member_of_collection_ids.present?
      collection_entries.each { |ce| bag_entries << ce if ce.parsed_metadata.value?(record.id) }
    end

    if record.file_sets.present?
      file_set_entries.each { |fse| bag_entries << fse if fse.parsed_metadata.value?(record.id) }
    end

    records_in_folder += bag_entries.count
    if records_in_folder > records_split_count
      folder_count += 1
      records_in_folder = bag_entries.count
    end

    # `bag` is local to this iteration, so a plain assignment is enough.
    bag = BagIt::Bag.new setup_bagit_folder(folder_count, entry.identifier)

    record.file_sets.each do |fs|
      file_name = filename(fs)
      next if file_name.blank? || fs.original_file.blank?

      file = Tempfile.new([file_name, File.extname(file_name)], binmode: true)
      begin
        # URI.open instead of Kernel#open: open-uri no longer patches Kernel#open
        # on Ruby 3. The block form closes the download stream.
        URI.open(fs.original_file.uri) { |io| file.write(io.read) }
        file.close
        begin
          bag.add_file(file_name, file.path) if bag.bag_files.none? { |b| b.include?(file_name) }
        rescue => e
          entry.set_status_info(e)
          set_status_info(e)
        end
      ensure
        # Remove the temporary copy once the bag has (or has not) taken the file.
        file.close!
      end
    end

    CSV.open(setup_csv_metadata_export_file(folder_count, entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
      bag_entries.each { |csv_entry| csv << csv_entry.parsed_metadata }
    end

    write_triples(folder_count, entry)
    bag.manifest!(algo: 'sha256')
  end
end

#write_triples(folder_count, e) ⇒ Object

@todo(bjustice) - remove hyrax reference



178
179
180
181
182
183
184
185
186
187
# File 'app/parsers/bulkrax/bagit_parser.rb', line 178

# Writes the N-Triples serialization of an entry's record into its bag folder.
#
# @todo(bjustice) - remove hyrax reference
#
# @param folder_count [Integer] number of the export folder the bag lives in
# @param e [#identifier] the entry whose record is exported
# @return [Integer, nil] the number of bytes written, or nil when no Solr
#   document was found for the entry
def write_triples(folder_count, e)
  sd = SolrDocument.find(e.identifier)
  return if sd.nil?

  # The graph exporter needs a request object; the local host name stands in
  # for the HTTP host.
  req = ActionDispatch::Request.new({ 'HTTP_HOST' => Socket.gethostname })
  rdf = Hyrax::GraphExporter.new(sd, req).fetch.dump(:ntriples)
  File.write(setup_triple_metadata_export_file(folder_count, e.identifier), rdf)
end