Class: Bulkrax::CsvEntry

Inherits:
Entry show all
Defined in:
app/models/bulkrax/csv_entry.rb

Overview

TODO: We need to rework this class some to address the Metrics/ClassLength rubocop offense. We do too much in these entry classes. We need to extract the common logic from the various entry models into a module that can be shared between them.

Direct Known Subclasses

CsvCollectionEntry, CsvFileSetEntry

Defined Under Namespace

Modules: AttributeBuilderMethod Classes: CsvPathError, CsvWrapper, MissingMetadata, RecordNotFound

Instance Attribute Summary

Attributes inherited from Entry

#all_attrs

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Entry

#build, #exporter?, #fetch_field_mapping, #find_collection, #importer?, #last_run, parent_field, #source_identifier, #work_identifier

Methods included from HasLocalProcessing

#add_local

Methods included from StatusInfo

#current_status, #failed?, #last_error, #set_status_info, #skipped?, #status, #status_at, #succeeded?

Methods included from ExportBehavior

#build_for_exporter, #file_extension, #filename, #hyrax_record

Methods included from ImportBehavior

#active_id_for_authority?, #add_admin_set_id, #add_collections, #add_rights_statement, #add_user_to_permission_templates!, #add_visibility, #build_for_importer, #child_jobs, #factory, #factory_class, #override_rights_statement, #parent_jobs, #rights_statement, #sanitize_controlled_uri_value, #sanitize_controlled_uri_values!, #validate_value

Methods included from HasMatchers

#add_metadata, #excluded?, #field_supported?, #field_to, #fields_that_are_always_multiple, #fields_that_are_always_singular, #get_object_name, #matched_metadata, #multiple?, #multiple_metadata, #schema_form_definitions, #set_parsed_data, #set_parsed_object_data, #single_metadata, #supported_bulkrax_fields

Class Method Details

.data_for_entry(data, _source_id, parser) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
# File 'app/models/bulkrax/csv_entry.rb', line 80

# Normalizes one row of CSV data into the raw-data hash kept for an entry.
#
# @param data [CSV::Table, #to_h] a row-like object responding to #to_h and #[];
#   when a CSV::Table is given only its first row is used
# @param _source_id [Object] unused
# @param parser [Object] handed to +parent_field+ to resolve the parent column name
# @return [Hash] the row as a hash, with +:model+ copied from the row when present and
#   +:parents+ mirrored from a custom-named parent column when that column exists
def self.data_for_entry(data, _source_id, parser)
  # If multi-row CSV data is passed, grab the first row
  data = data.first if data.is_a?(CSV::Table)
  # model has to be separated so that it doesn't get mistranslated by to_h
  raw_data = data.to_h
  raw_data[:model] = data[:model] if data[:model].present?
  # If the parent field mapping is not 'parents', also expose its value under :parents
  parent_key = parent_field(parser)
  raw_data[:parents] = raw_data[parent_key.to_sym] if raw_data.key?(parent_key.to_sym) && parent_key != 'parents'
  raw_data
end

.fields_from_data(data) ⇒ Object



32
33
34
# File 'app/models/bulkrax/csv_entry.rb', line 32

# Lists the distinct, non-nil header names found on +data+.
#
# @param data [#headers] object whose #headers returns a (possibly nested) array
# @return [Array] flattened headers with nils and duplicates removed, first occurrence kept
def self.fields_from_data(data)
  header_names = data.headers.flatten
  header_names.compact!
  header_names.uniq!
  header_names
end

.matcher_classObject



378
379
380
# File 'app/models/bulkrax/csv_entry.rb', line 378

# Names the matcher class associated with this entry type.
#
# @return [Class] always Bulkrax::CsvMatcher
def self.matcher_class
  Bulkrax::CsvMatcher
end

.read_data(path) ⇒ Object

There’s a risk that this reads the whole file into memory, which could cause memory problems for large files. We strip any special characters out of the headers (looking at you, Excel).

Raises:

  • (CsvPathError) — when the given path is blank



40
41
42
43
44
45
46
47
48
49
50
# File 'app/models/bulkrax/csv_entry.rb', line 40

# Reads the CSV at +path+ and wraps the parsed result.
#
# Headers are converted to symbols after removing every character that is not a word
# character, digit, dot, space or hyphen, and stripping surrounding whitespace.
# Entries from +csv_read_data_options+ override the defaults built here.
#
# @param path [String] location of the CSV file
# @return [Object] a new +csv_wrapper_class+ instance built from the CSV.read result
# @raise [CsvPathError] when +path+ is blank
def self.read_data(path)
  raise CsvPathError, 'CSV path empty' if path.blank?

  sanitize_header = ->(h) { h.to_s.gsub(/[^\w\d\. -]+/, '').strip.to_sym }
  defaults = { headers: true, header_converters: sanitize_header, encoding: 'utf-8' }
  read_options = defaults.merge(csv_read_data_options)

  # CSV.read loads the entire file before returning
  table = CSV.read(path, **read_options)
  csv_wrapper_class.new(table)
end

Instance Method Details

#add_fileObject



159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'app/models/bulkrax/csv_entry.rb', line 159

# Fills the 'file' entry of the parsed metadata with resolved file paths.
#
# A String in record['file'] is split on Bulkrax.multi_value_element_split_on, an Array is
# used as-is, and anything else leaves the existing (or empty) list in place. Blank names
# are dropped, spaces in names become underscores, and each name goes through #path_to_file.
#
# NOTE(review): the extracted listing read `self.['file']`; the `parsed_metadata` receiver
# was restored here — confirm against app/models/bulkrax/csv_entry.rb.
#
# @return [Array] the resolved file paths
def add_file
  self.parsed_metadata['file'] ||= []
  file_value = record['file']
  if file_value.is_a?(String)
    self.parsed_metadata['file'] = file_value.split(Bulkrax.multi_value_element_split_on)
  elsif file_value.is_a?(Array)
    self.parsed_metadata['file'] = file_value
  end
  self.parsed_metadata['file'] = self.parsed_metadata['file'].reject(&:blank?).map do |name|
    path_to_file(name.tr(' ', '_'))
  end
end

#add_identifierObject



126
127
128
# File 'app/models/bulkrax/csv_entry.rb', line 126

# Stores the record's source identifier, wrapped in an array, under the work identifier key.
#
# NOTE(review): the extracted listing read `self.[work_identifier]`; the `parsed_metadata`
# receiver was restored here — confirm against app/models/bulkrax/csv_entry.rb.
def add_identifier
  self.parsed_metadata[work_identifier] = [record[source_identifier]]
end

#add_ingested_metadataObject



150
151
152
153
154
155
156
157
# File 'app/models/bulkrax/csv_entry.rb', line 150

# Passes every key/value pair of the record to #add_metadata.
#
# When a key contains a non-zero number, that number minus one is passed as the index;
# otherwise the index is nil. The key itself is passed through #key_without_numbers.
#
# NOTE(review): the method name and the `add_metadata` call were missing from the
# extracted listing and were restored from the page's method headings — confirm against
# app/models/bulkrax/csv_entry.rb.
def add_ingested_metadata
  # we do not want to sort the values in the record before adding the metadata.
  # if we do, the factory_class will be set to the default_work_type for all values that come before "model" or "work type"
  record.each do |key, value|
    number = key[/\d+/].to_i
    index = number - 1 unless number.zero?
    add_metadata(key_without_numbers(key), value, index)
  end
end

#add_metadata_for_modelObject



136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'app/models/bulkrax/csv_entry.rb', line 136

# Runs the model-specific metadata steps selected by +factory_class+.
#
# - collection model: adds the collection type gid, only when ::Hyrax is defined
# - file model: validates the filename, adds the file path, validates the parent
# - anything else: adds files (unless the importer is metadata-only) and the admin set id
#
# NOTE(review): the method name and the predicate after `importerexporter.` were missing
# from the extracted listing. The name comes from the page's method heading; the predicate
# is presumed to be `metadata_only?` — confirm against app/models/bulkrax/csv_entry.rb.
def add_metadata_for_model
  if factory_class.present? && factory_class == Bulkrax.collection_model_class
    add_collection_type_gid if defined?(::Hyrax)
    # add any additional collection metadata methods here
  elsif factory_class == Bulkrax.file_model_class
    validate_presence_of_filename!
    add_path_to_file
    validate_presence_of_parent!
  else
    add_file unless importerexporter.metadata_only?
    add_admin_set_id
  end
end

#build_export_metadataObject



173
174
175
176
177
178
179
180
181
182
183
# File 'app/models/bulkrax/csv_entry.rb', line 173

# Rebuilds the parsed metadata for export and saves the entry.
#
# Starts from an empty hash, then adds system, file, relationship and mapping metadata.
# File metadata is only built when a collection model class is configured and the
# exported record is not an instance of it.
#
# NOTE(review): the method name, the `parsed_metadata` receiver and the four
# `build_*_metadata` calls were missing from the extracted listing. They were restored
# from the page's method headings (call order presumed to follow the blank lines of the
# listing) — confirm against app/models/bulkrax/csv_entry.rb.
#
# @return [Hash] the parsed metadata after saving
def build_export_metadata
  self.parsed_metadata = {}

  build_system_metadata
  build_files_metadata if Bulkrax.collection_model_class.present? && !hyrax_record.is_a?(Bulkrax.collection_model_class)
  build_relationship_metadata
  build_mapping_metadata
  self.save!

  self.parsed_metadata
end

#build_files_metadataObject



197
198
199
200
201
202
203
204
205
206
207
208
# File 'app/models/bulkrax/csv_entry.rb', line 197

# Adds file information for the exported record.
#
# Works only get their thumbnail files; for every other record the file names of its
# file sets are written under the export key for 'file', joined or numbered depending on
# whether the 'file' mapping has a 'join' setting.
#
# NOTE(review): the method name was missing from the extracted listing and was restored
# from the page's method heading.
def build_files_metadata
  # attaching files to the FileSet row only so we don't have duplicates when importing to a new tenant
  if hyrax_record.work?
    build_thumbnail_files
  else
    file_mapping = key_for_export('file')
    file_sets = hyrax_record.file_set? ? Array.wrap(hyrax_record) : hyrax_record.file_sets
    filenames = map_file_sets(file_sets)

    handle_join_on_export(file_mapping, filenames, mapping['file']&.[]('join')&.present?)
  end
end

#build_mapping_metadataObject



266
267
268
269
270
271
272
273
274
# File 'app/models/bulkrax/csv_entry.rb', line 266

# Walks the field mapping and dispatches each entry to its builder method.
#
# AttributeBuilderMethod.for picks the method name for a mapping entry; entries for which
# it returns a falsy value are skipped, the rest are invoked with the key and value.
#
# NOTE(review): the method name was missing from the extracted listing and was restored
# from the page's method heading.
def build_mapping_metadata
  mapping = fetch_field_mapping
  mapping.each do |key, value|
    method_name = AttributeBuilderMethod.for(key: key, value: value, entry: self)
    next unless method_name

    send(method_name, key, value)
  end
end

#build_metadataObject



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'app/models/bulkrax/csv_entry.rb', line 92

# Builds the parsed metadata for an import from the raw record.
#
# Validates the record first, resets the parsed metadata to an empty hash, then runs the
# individual add/sanitize steps in order.
#
# NOTE(review): the method name, the `parsed_metadata` receiver and two calls were missing
# from the extracted listing. They were restored from the page's method headings
# (`add_ingested_metadata` before the collections TODO, `add_metadata_for_model` after
# `add_visibility`) — confirm against app/models/bulkrax/csv_entry.rb.
#
# @return [Hash] the parsed metadata
def build_metadata
  validate_record

  self.parsed_metadata = {}
  add_identifier
  establish_factory_class
  add_ingested_metadata
  # TODO(alishaevn): remove the collections stuff entirely and only reference collections via the new parents code
  add_collections
  add_visibility
  add_metadata_for_model
  add_rights_statement
  sanitize_controlled_uri_values!
  add_local

  self.parsed_metadata
end

#build_metadata_for_deleteObject

limited metadata is needed for delete jobs



111
112
113
114
115
116
# File 'app/models/bulkrax/csv_entry.rb', line 111

# Builds the limited parsed metadata needed for delete jobs.
#
# NOTE(review): the method name, the `parsed_metadata` receiver and the middle call were
# missing from the extracted listing. The name comes from the page's method heading; the
# call is presumed to be `add_metadata_for_model` — confirm against
# app/models/bulkrax/csv_entry.rb.
#
# @return [Hash] the parsed metadata
def build_metadata_for_delete
  self.parsed_metadata = {}
  establish_factory_class
  add_metadata_for_model
  self.parsed_metadata
end

#build_object(_key, value) ⇒ Object



276
277
278
279
280
281
282
283
284
# File 'app/models/bulkrax/csv_entry.rb', line 276

# Exports the object-valued property named by value['object'].
#
# Does nothing when the exported record does not respond to that name or when the
# property is empty. An ActiveTriples::Relation is converted with #to_a before wrapping.
#
# NOTE(review): the call on the last line had lost its method name in the extracted
# listing; restored as `object_metadata`, the only method on this page taking such data —
# confirm against app/models/bulkrax/csv_entry.rb.
#
# @param _key [Object] unused
# @param value [Hash] mapping entry; its 'object' value names the property to read
def build_object(_key, value)
  return unless hyrax_record.respond_to?(value['object'])

  data = hyrax_record.send(value['object'])
  return if data.empty?

  data = data.to_a if data.is_a?(ActiveTriples::Relation)
  object_metadata(Array.wrap(data))
end

#build_relationship_metadataObject



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'app/models/bulkrax/csv_entry.rb', line 210

# Exports parent and child relationship ids for the record.
#
# For each relationship key, every listed method the record responds to is called, ids
# are collected (using #id when the value has one), flattened and de-duplicated, and the
# result is written through #handle_join_on_export. Blank keys and empty results are skipped.
#
# NOTE(review): the method name was missing from the extracted listing and was restored
# from the page's method heading.
def build_relationship_metadata
  # Includes all relationship methods for all exportable record types (works, Collections, FileSets)
  # @TODO: this logic assumes that the relationships are all available via a method that can be called
  #        on the object. With Valkyrie, this is only true for Hyrax-based models which include the
  #        ArResource module. We need to consider reworking this logic into an object factory method
  #        that can handle different types of models.
  relationship_methods = {
    related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids parent],
    related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids member_ids]
  }

  relationship_methods.each do |relationship_key, methods|
    next if relationship_key.blank?

    values = []
    methods.each do |m|
      value = hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
      value_id = value.try(:id)&.to_s || value # get the id if it's an object
      values << value_id if value_id.present?
    end
    values = values.flatten.uniq
    next if values.blank?

    # NOTE(review): the join flag is read from the parents mapping for both the parents
    # and the children key — looks unintended for children; confirm before changing.
    handle_join_on_export(relationship_key, values, mapping[related_parents_parsed_mapping]['join'].present?)
  end
end

#build_system_metadataObject

Metadata required by Bulkrax for round-tripping



186
187
188
189
190
191
192
193
194
195
# File 'app/models/bulkrax/csv_entry.rb', line 186

# Records the metadata Bulkrax needs for round-tripping: the record id, the first source
# identifier value, and the model name under the export key for 'model'.
#
# NOTE(review): the method name and the `parsed_metadata` receiver were missing from the
# extracted listing; the name comes from the page's method heading — confirm against
# app/models/bulkrax/csv_entry.rb.
def build_system_metadata
  self.parsed_metadata['id'] = hyrax_record.id
  source_id = hyrax_record.send(work_identifier)
  # Because ActiveTriples::Relation does not respond to #to_ary we can't rely on Array.wrap universally
  source_id = source_id.to_a if source_id.is_a?(ActiveTriples::Relation)
  source_id = Array.wrap(source_id).first
  self.parsed_metadata[source_identifier] = source_id
  model_name = Bulkrax.object_factory.model_name(resource: hyrax_record)
  self.parsed_metadata[key_for_export('model')] = model_name
end

#build_thumbnail_filesObject



353
354
355
356
357
358
359
360
361
# File 'app/models/bulkrax/csv_entry.rb', line 353

# Exports the record's thumbnail file names under numbered 'thumbnail_file' keys.
#
# Skipped when the importer/exporter does not include thumbnails or when the object
# factory returns no thumbnail for the record.
def build_thumbnail_files
  return unless importerexporter.include_thumbnails

  thumb = Bulkrax.object_factory.thumbnail_for(resource: hyrax_record)
  return unless thumb

  thumb_names = map_file_sets(Array.wrap(thumb))
  # join is always false here, so the names are written as numbered keys
  handle_join_on_export('thumbnail_file', thumb_names, false)
end

#build_value(property_name, mapping_config) ⇒ Object



286
287
288
289
290
291
292
293
294
295
296
297
298
# File 'app/models/bulkrax/csv_entry.rb', line 286

# Exports one property of the record into the parsed metadata.
#
# When the mapping has a truthy 'join' or the value is not Enumerable, a single joined
# value is written under the export key; otherwise each element is written under
# "<export key>_<n>" with n starting at 1. Properties the record does not respond to are skipped.
#
# NOTE(review): the extracted listing read `self.[...]`; the `parsed_metadata` receiver
# was restored here — confirm against app/models/bulkrax/csv_entry.rb.
#
# @param property_name [String, Symbol] property to read from the exported record
# @param mapping_config [Hash] mapping entry for the property; only 'join' is read here
def build_value(property_name, mapping_config)
  return unless hyrax_record.respond_to?(property_name.to_s)

  data = hyrax_record.send(property_name.to_s)

  if mapping_config['join'] || !data.is_a?(Enumerable)
    self.parsed_metadata[key_for_export(property_name)] = prepare_export_data_with_join(data)
  else
    data.each_with_index do |d, i|
      self.parsed_metadata["#{key_for_export(property_name)}_#{i + 1}"] = prepare_export_data(d)
    end
  end
end

#collection_identifiersObject



382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# File 'app/models/bulkrax/csv_entry.rb', line 382

# Resolves the parent references of the record to identifiers of collection entries.
#
# The parent column value is split on Bulkrax.multi_value_element_split_on; for each
# reference, the importer's CsvCollectionEntry whose source identifier matches is looked
# up and its #identifier collected. A non-empty result is memoized.
#
# NOTE(review): the extracted listing read `e.&.[](source_identifier)`; the receiver is
# presumed to be `raw_metadata` — confirm against app/models/bulkrax/csv_entry.rb.
#
# @return [Array] identifiers of the matching collection entries; empty when there is no
#   parent mapping, no parent value, or no match
# @raise [StandardError] when more than one collection entry matches a reference
def collection_identifiers
  return @collection_identifiers if @collection_identifiers.present?

  parent_field_mapping = self.class.parent_field(parser)
  return [] unless parent_field_mapping.present? && record[parent_field_mapping].present?

  identifiers = []
  split_references = record[parent_field_mapping].split(Bulkrax.multi_value_element_split_on)
  split_references.each do |c_reference|
    matching_collection_entries = importerexporter.entries.select do |e|
      (e.raw_metadata&.[](source_identifier) == c_reference) &&
        e.is_a?(CsvCollectionEntry)
    end
    raise ::StandardError, 'Only expected to find one matching entry' if matching_collection_entries.count > 1
    identifiers << matching_collection_entries.first&.identifier
  end
  @collection_identifiers = identifiers.compact.presence || []
end

#collections_created?Boolean

Returns:

  • (Boolean)


401
402
403
404
# File 'app/models/bulkrax/csv_entry.rb', line 401

# Reports whether collections have been created; currently hard-wired to true.
#
# @return [Boolean] always true
def collections_created?
  # TODO: look into if this method is still needed after new relationships code
  true
end

#establish_factory_classObject



130
131
132
133
134
# File 'app/models/bulkrax/csv_entry.rb', line 130

# Feeds the record's model value(s) into the metadata under the 'model' field.
#
# Each key in parser.model_field_mappings that is present in the record results in one
# add_metadata('model', ...) call.
#
# NOTE(review): the call inside the loop had lost its method name in the extracted
# listing; restored as `add_metadata` (listed on this page under HasMatchers) — confirm
# against app/models/bulkrax/csv_entry.rb.
def establish_factory_class
  parser.model_field_mappings.each do |key|
    add_metadata('model', record[key]) if record.key?(key)
  end
end

#find_collection_idsObject



406
407
408
409
410
411
412
413
414
415
416
417
# File 'app/models/bulkrax/csv_entry.rb', line 406

# Returns the entry's collection ids, adding any found through #collection_identifiers.
#
# When #collections_created? is true the current ids are returned untouched. Otherwise
# each collection identifier is looked up with #find_collection and the id of every
# collection found is appended unless it is already present.
#
# @return [Array] the collection ids
def find_collection_ids
  return self.collection_ids if collections_created?

  identifiers = collection_identifiers
  if identifiers.present?
    identifiers.each do |identifier|
      collection = find_collection(identifier)
      next if collection.blank? || self.collection_ids.include?(collection.id)

      self.collection_ids << collection.id
    end
  end

  self.collection_ids
end

#handle_join_on_export(key, values, join) ⇒ Object



363
364
365
366
367
368
369
370
371
372
# File 'app/models/bulkrax/csv_entry.rb', line 363

# Writes export values either as one joined string or as numbered keys.
#
# With +join+ truthy the values are joined with Bulkrax.multi_value_element_join_on and
# stored under +key+. Otherwise each value is stored under "<key>_<n>" (n starting at 1)
# and any existing un-numbered +key+ entry is removed.
#
# NOTE(review): the hash receiver was missing from the extracted listing (it read
# `[key] = ...` and `.delete(key)`); restored as `parsed_metadata` — confirm against
# app/models/bulkrax/csv_entry.rb.
#
# @param key [String] base key to write under
# @param values [Array] values to export
# @param join [Boolean] whether to store one joined value
def handle_join_on_export(key, values, join)
  if join
    parsed_metadata[key] = values.join(Bulkrax.multi_value_element_join_on)
  else
    values.each_with_index do |value, i|
      parsed_metadata["#{key}_#{i + 1}"] = value
    end
    parsed_metadata.delete(key)
  end
end

#key_for_export(key) ⇒ Object

On export the key becomes the from and the from becomes the destination. It is the opposite of the import because we are moving data in the opposite direction. Metadata that does not have a specific Bulkrax entry is mapped to the key name, as matching keys coming in are mapped by the csv parser automatically.



302
303
304
305
306
307
# File 'app/models/bulkrax/csv_entry.rb', line 302

# Translates a metadata key into the column name used on export.
#
# The number-free form of the key is looked up in the mapping; when an entry exists its
# first 'from' value is used, otherwise the number-free key itself. Whatever the
# number-free form removed from the original key is appended again.
#
# @param key [String] metadata key, possibly carrying a numeric suffix
# @return [String] the export column name
def key_for_export(key)
  base_key = key_without_numbers(key)
  mapping_entry = mapping[base_key]
  export_name = mapping_entry ? mapping_entry['from'].first : base_key
  # Bring the number back if there is one
  number_suffix = key.sub(base_key, '')
  "#{export_name}#{number_suffix}"
end

#object_metadata(data) ⇒ Object



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# File 'app/models/bulkrax/csv_entry.rb', line 325

# Flattens object-valued data into numbered parsed-metadata keys.
#
# Each element of +data+ is evaluated into Ruby objects, the results are flattened, and
# every key of every resulting hash is written as "<export key>_<n>" — or
# "<export key>_<n>_<m>" for each element of an Array value — with n and m starting at 1.
#
# NOTE(review): the method name and the `parsed_metadata` receiver were missing from the
# extracted listing; the name comes from the page's method heading — confirm against
# app/models/bulkrax/csv_entry.rb.
#
# @param data [Array<String>] stringified Ruby arrays/hashes
def object_metadata(data)
  # NOTE: What is `d` in this case:
  #
  #  "[{\"single_object_first_name\"=>\"Fake\", \"single_object_last_name\"=>\"Fakerson\", \"single_object_position\"=>\"Leader, Jester, Queen\", \"single_object_language\"=>\"english\"}]"
  #
  # The above is a stringified version of a Ruby string.  Using eval is a very bad idea as it
  # will execute the value of `d` within the full Ruby interpreter context.
  #
  # TODO: Would it be possible to store this as a non-string?  Maybe the actual Ruby Array and Hash?
  # SECURITY(review): eval is kept as-is to preserve behavior; it must never see untrusted input.
  data = data.map { |d| eval(d) }.flatten # rubocop:disable Security/Eval

  data.each_with_index do |obj, index|
    next if obj.nil?
    # allow the object_key to be valid whether it's a string or symbol
    obj = obj.with_indifferent_access

    obj.each_key do |key|
      if obj[key].is_a?(Array)
        obj[key].each_with_index do |nested_item, nested_index|
          self.parsed_metadata["#{key_for_export(key)}_#{index + 1}_#{nested_index + 1}"] = prepare_export_data(nested_item)
        end
      else
        self.parsed_metadata["#{key_for_export(key)}_#{index + 1}"] = prepare_export_data(obj[key])
      end
    end
  end
end

#path_to_file(file) ⇒ Object

If only filename is given, construct the path (/files/my_file)



420
421
422
423
424
425
426
427
# File 'app/models/bulkrax/csv_entry.rb', line 420

# Resolves a file reference to an existing path.
#
# A reference that already exists on disk is returned unchanged; otherwise it is joined
# onto the parser's path_to_files directory and that candidate must exist.
#
# @param file [String] full path or bare file name
# @return [String] an existing path
# @raise [RuntimeError] when neither the reference nor the joined path exists
def path_to_file(file)
  # nothing to do if we already have the full file path
  return file if File.exist?(file)

  candidate = File.join(importerexporter.parser.path_to_files, file)
  raise "File #{candidate} does not exist" unless File.exist?(candidate)

  candidate
end

#prepare_export_data(datum) ⇒ Object



317
318
319
320
321
322
323
# File 'app/models/bulkrax/csv_entry.rb', line 317

# Converts a single value for export.
#
# @param datum [Object] value to export
# @return [Object] the URI string for an ActiveTriples::Resource, otherwise +datum+ unchanged
def prepare_export_data(datum)
  return datum unless datum.is_a?(ActiveTriples::Resource)

  datum.to_uri.to_s
end

#prepare_export_data_with_join(data) ⇒ Object



309
310
311
312
313
314
315
# File 'app/models/bulkrax/csv_entry.rb', line 309

# Converts a value for export as a single string.
#
# @param data [Object] a scalar or an Enumerable of values
# @return [String] +data.to_s+ for non-Enumerables, "" for an empty Enumerable, otherwise
#   the prepared elements joined with Bulkrax.multi_value_element_join_on
def prepare_export_data_with_join(data)
  if !data.is_a?(Enumerable)
    # Yes...it's possible we're asking to coerce a multi-value but only have a single value.
    data.to_s
  elsif data.empty?
    ""
  else
    prepared = data.map { |item| prepare_export_data(item) }
    prepared.join(Bulkrax.multi_value_element_join_on).to_s
  end
end

#recordObject



374
375
376
# File 'app/models/bulkrax/csv_entry.rb', line 374

# Memoized accessor for the raw record this entry was built from.
#
# NOTE(review): the right-hand side of the assignment was missing from the extracted
# listing; presumed to be `raw_metadata` — confirm against app/models/bulkrax/csv_entry.rb.
def record
  @record ||= raw_metadata
end

#validate_recordObject

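Raises:

  • (RecordNotFound) — when the record is nil
  • (MissingMetadata) — when the parser reports missing required elements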



118
119
120
121
122
123
124
# File 'app/models/bulkrax/csv_entry.rb', line 118

# Checks that the record exists and carries the elements the parser requires.
#
# @return [nil] when the record is valid
# @raise [RecordNotFound] when the record is nil
# @raise [MissingMetadata] when the parser reports missing required elements; the message
#   lists them separated by ", "
def validate_record
  raise RecordNotFound, 'Record not found' if record.nil?

  csv_parser = importerexporter.parser
  return if csv_parser.required_elements?(record)

  missing = csv_parser.missing_elements(record).join(', ')
  raise MissingMetadata, "Missing required elements, missing element(s) are: #{missing}"
end