Class: BulkOps::Parser
- Inherits:
-
Object
- Object
- BulkOps::Parser
- Defined in:
- lib/bulk_ops/parser.rb
Instance Attribute Summary collapse
-
#proxy ⇒ Object
Returns the value of attribute proxy.
-
#raw_data ⇒ Object
Returns the value of attribute raw_data.
-
#raw_row ⇒ Object
Returns the value of attribute raw_row.
Class Method Summary collapse
Instance Method Summary collapse
- #connect_existing_work ⇒ Object
- #delete_file_set(fileset_id) ⇒ Object
- #disambiguate_columns ⇒ Object
- #downcase_first_letter(str) ⇒ Object
- #find_collection(collection) ⇒ Object
- #find_field_name(field) ⇒ Object
- #find_or_create_collection(collection) ⇒ Object
- #find_previous_parent(field = "parent") ⇒ Object
- #find_work_id_from_unique_metadata(field_name, value) ⇒ Object
- #findAuthUrl(auth, value) ⇒ Object
- #format_param_name(name) ⇒ Object
- #format_visibility(value) ⇒ Object
- #format_worktype(value) ⇒ Object
- #get_remote_id(value, authority: nil, property: nil) ⇒ Object
- #get_removed_filesets(filestring) ⇒ Object
- #getLocalAuth(field_name) ⇒ Object
-
#initialize(prx, metadata_sheet = nil) ⇒ Parser
constructor
A new instance of Parser.
- #interpret_controlled_fields ⇒ Object
- #interpret_data(raw_row: nil, raw_data: nil, proxy: nil) ⇒ Object
- #interpret_file_fields ⇒ Object
- #interpret_option_fields ⇒ Object
- #interpret_relationship_fields ⇒ Object
- #interpret_relationship_value(id_type, value, field = "parent") ⇒ Object
- #interpret_scalar_fields ⇒ Object
- #localAuthUrl(property, value) ⇒ Object
- #localIdToUrl(id, auth_name) ⇒ Object
- #mintLocalAuthUrl(auth_name, value) ⇒ Object
- #record_exists?(id) ⇒ Boolean
- #report_error(type, message, **args) ⇒ Object
- #schema ⇒ Object
- #setAdminSet ⇒ Object
- #setMetadataInheritance ⇒ Object
- #split_values(value_string) ⇒ Object
- #unescape_csv(value) ⇒ Object
Constructor Details
#initialize(prx, metadata_sheet = nil) ⇒ Parser
Returns a new instance of Parser.
26 27 28 29 30 31 32 |
# File 'lib/bulk_ops/parser.rb', line 26 def initialize prx, metadata_sheet=nil @proxy = prx @raw_data = (metadata_sheet || proxy.operation.metadata) @raw_row = @raw_data[@proxy.row_number] @metadata = {} @parsing_errors = [] end |
Instance Attribute Details
#proxy ⇒ Object
Returns the value of attribute proxy.
5 6 7 |
# File 'lib/bulk_ops/parser.rb', line 5 def proxy @proxy end |
#raw_data ⇒ Object
Returns the value of attribute raw_data.
5 6 7 |
# File 'lib/bulk_ops/parser.rb', line 5 def raw_data @raw_data end |
#raw_row ⇒ Object
Returns the value of attribute raw_row.
5 6 7 |
# File 'lib/bulk_ops/parser.rb', line 5 def raw_row @raw_row end |
Class Method Details
.is_file_set?(metadata, row_number) ⇒ Boolean
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/bulk_ops/parser.rb', line 9 def self.is_file_set? metadata, row_number return false unless metadata[row_number].present? # If the work type is explicitly specified, use that if (type_key = metadata[row_number].to_h.keys.find{|key| key.to_s.downcase.gsub(/[_\-\s]/,"").include?("worktype") }) return true if metadata[row_number][type_key].downcase == "fileset" return false if metadata[row_number][type_key].present? end # Otherwise, if there are any valid fields other than relationship or file fields, call it a work metadata[row_number].each do |field, value| next if BulkOps::Verification.is_file_field?(field) next if ["parent", "order"].include?(normalize_relationship_field_name(field)) next if ["title","label"].include?(field.downcase.strip) return false end return true end |
.normalize_relationship_field_name(field) ⇒ Object
322 323 324 325 |
# File 'lib/bulk_ops/parser.rb', line 322 def self.normalize_relationship_field_name field normfield = field.downcase.parameterize.gsub(/[_\s-]/,'') BulkOps::RELATIONSHIP_FIELDS.find{|rel_field| normfield == rel_field } end |
Instance Method Details
#connect_existing_work ⇒ Object
68 69 70 71 72 73 74 |
# File 'lib/bulk_ops/parser.rb', line 68 def connect_existing_work return unless (column_name = operation.options["update_identifier"]) return unless (key = @raw_row.to_h.keys.find{|key| key.to_s.parameterize.downcase.gsub("_","") == column_name.to_s.parameterize.downcase.gsub("_","")}) return unless (value = @raw_row[key]).present? return unless (work_id = find_work_id_from_unique_metadata(key, value)) proxy.update(work_id: work_id) end |
#delete_file_set(fileset_id) ⇒ Object
465 466 467 |
# File 'lib/bulk_ops/parser.rb', line 465 def delete_file_set fileset_id BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email ) end |
#disambiguate_columns ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/bulk_ops/parser.rb', line 54 def disambiguate_columns #do nothing unless there are columns with the same header return unless (@raw_row.respond_to?(:headers) && (@raw_row.headers.uniq.length < @raw_row.length) ) row = {} (0...@raw_row.length).each do |i| header = @raw_row.headers[i] value = @raw_row[i] next unless value.present? # separate values in identical columns using the separator row[header] = (Array(row[header]) << value).join(BulkOps::SEPARATOR) end @raw_row = row end |
#downcase_first_letter(str) ⇒ Object
510 511 512 513 |
# File 'lib/bulk_ops/parser.rb', line 510 def downcase_first_letter(str) return "" unless str str[0].downcase + str[1..-1] end |
#find_collection(collection) ⇒ Object
479 480 481 482 483 484 |
# File 'lib/bulk_ops/parser.rb', line 479 def find_collection(collection) cols = Collection.where(id: collection) cols += Collection.where(title: collection).select{|col| col.title.first == collection} return cols.last unless cols.empty? return false end |
#find_field_name(field) ⇒ Object
506 507 508 |
# File 'lib/bulk_ops/parser.rb', line 506 def find_field_name(field) operation.find_field_name(field) end |
#find_or_create_collection(collection) ⇒ Object
486 487 488 489 490 491 |
# File 'lib/bulk_ops/parser.rb', line 486 def find_or_create_collection(collection) col = find_collection(collection) return col if col return false if collection.to_i > 0 col = Collection.create(title: [collection.to_s], depositor: operation.user.email, collection_type: Hyrax::CollectionType.find_by(title:"User Collection")) end |
#find_previous_parent(field = "parent") ⇒ Object
327 328 329 330 331 332 333 334 335 |
# File 'lib/bulk_ops/parser.rb', line 327 def find_previous_parent field="parent" #Return the row number of the most recent preceding row that does # not itself have a parent defined i = 1; while (prev_row = raw_data[row_number - i]) return (row_number - i) if prev_row[field].blank? i += 1 end end |
#find_work_id_from_unique_metadata(field_name, value) ⇒ Object
76 77 78 79 80 81 82 83 84 |
# File 'lib/bulk_ops/parser.rb', line 76 def find_work_id_from_unique_metadata field_name, value field_solr_name = schema.get_field(field_name).solr_name query = "_query_:\"{!dismax qf=#{field_solr_name}}#{value}\"" response = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path, params: { fq: query, rows: 1, start: 0})["response"] if response["numFound"] > 1 report_error( :id_not_unique , "", row_number: row_number, object_id: @proxy.id, options_name: field_name, option_values: value ) end return response["docs"][0]["id"] end |
#findAuthUrl(auth, value) ⇒ Object
398 399 400 401 402 403 404 405 406 407 408 409 410 |
# File 'lib/bulk_ops/parser.rb', line 398 def findAuthUrl(auth, value) value.strip! return nil if auth.nil? return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value)) entries.each do |entry| #require exact match next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8') url = entry["url"] || entry["id"] # url = localIdToUrl(url,auth) unless url =~ URI::regexp return url end return nil end |
#format_param_name(name) ⇒ Object
498 499 500 |
# File 'lib/bulk_ops/parser.rb', line 498 def format_param_name(name) name.titleize.gsub(/\s+/, "").camelcase(:lower) end |
#format_visibility(value) ⇒ Object
376 377 378 379 380 381 382 383 384 385 |
# File 'lib/bulk_ops/parser.rb', line 376 def format_visibility(value) case value.downcase when "public", "open", "true" return "open" when "campus", "ucsc", "institution" return "ucsc" when "restricted", "private", "closed", "false" return "restricted" end end |
#format_worktype(value) ⇒ Object
367 368 369 370 371 372 373 374 |
# File 'lib/bulk_ops/parser.rb', line 367 def format_worktype(value) # format the value like a class name type = value.titleize.gsub(/[-_\s]/,'') # reject it if it isn't a defined class type = false unless Object.const_defined? type # fall back to the work type defined by the operation, or a standard "Work" return type ||= work_type || operation.work_type || "Work" end |
#get_remote_id(value, authority: nil, property: nil) ⇒ Object
493 494 495 496 |
# File 'lib/bulk_ops/parser.rb', line 493 def get_remote_id(value, authority: nil, property: nil) return false #TODO retrieve URL for this value from the specified remote authority end |
#get_removed_filesets(filestring) ⇒ Object
450 451 452 453 454 455 456 457 458 459 460 461 462 463 |
# File 'lib/bulk_ops/parser.rb', line 450 def get_removed_filesets(filestring) file_ids = split_values(filestring) file_ids.select{|file_id| record_exists?(file_id)} # This part handles filenames in addition to file ids. It doesn't work yet! # file_ids.map do |file_id| # If the filename is the id of an existing record, keep that # next(file_id) if (record_exists?(file_id)) # If this is the label (i.e.filename) of an existing fileset, use that fileset id # TODO MAKE THIS WORK!! # next(filename) if (filename_exists?(filename)) # File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename) # end end |
#getLocalAuth(field_name) ⇒ Object
420 421 422 423 424 425 426 427 428 429 |
# File 'lib/bulk_ops/parser.rb', line 420 def getLocalAuth(field_name) field = schema.get_property(field_name) # There is only ever one local authority per field, so just pick the first you find if vocs = field.vocabularies vocs.each do |voc| return voc["subauthority"] if voc["authority"].downcase == "local" end end return nil end |
#interpret_controlled_fields ⇒ Object
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/bulk_ops/parser.rb', line 86 def interpret_controlled_fields # The labels array tracks the contents of columns marked as labels, # which may require special validation labels = {} # This hash is populated with relevant data as we loop through the fields controlled_data = {} @raw_row.each do |field_name, value| next if value.blank? or field_name.blank? field_name = field_name.to_s #If our CSV interpreter is feeding us the headers as a line, ignore it. next if field_name == value #check if they are using the 'field_name.authority' syntax authority = nil if ((split=field_name.split('.')).count == 2) authority = split.last field_name = split.first end # get the field name, if this column is a metadata field field_name_norm = find_field_name(field_name) field = schema.get_field(field_name_norm) # Ignore anything that isn't a controlled field next unless field.present? && field.controlled? # Keep track of label fields if field_name.downcase.ends_with?("label") next if operation.options["ignore_labels"] labels[field_name_norm] ||= [] labels[field_name_norm] += split_values value next unless operation.options["import_labels"] end remove = field_name.downcase.starts_with?("remove") || field_name.downcase.starts_with?("delete") # handle multiple values value_array = split_values(value) controlled_data[field_name_norm] ||= [] unless value_array.blank? value_array.each do |value| # Decide if we're dealing with a label or url # It's an ID if it's a URL and the name doesn't end in 'label' value.strip! 
if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label") value_id = value # label = WorkIndexer.fetch_remote_label(value) # error_message = "cannot fetch remote label for url: #{value}" # report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label else # It's a label, so unescape it and get the id value = unescape_csv(value) value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value) # label = value report_error(:cannot_retrieve_url, message: "cannot find or create url for controlled vocabulary label: #{value}", url: value, row_number: row_number) unless value_id end controlled_data[field_name_norm] << {id: value_id, remove: field_name.downcase.starts_with?("remove")} end end # Actually add all the data controlled_data.each do |property_name, data| @metadata["#{property_name}_attributes"] ||= [] unless data.blank? data.uniq.each do |datum| atts = {"id" => datum[:id]} atts["_delete"] = true if datum[:remove] @metadata["#{property_name}_attributes"] << atts end end end |
#interpret_data(raw_row: nil, raw_data: nil, proxy: nil) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/bulk_ops/parser.rb', line 34 def interpret_data raw_row: nil, raw_data: nil, proxy: nil @raw_row = raw_row if raw_row.present? @proxy = proxy if proxy.present? @raw_data = raw_data if raw_data.present? disambiguate_columns setAdminSet #The order here matters a little: interpreting the relationship fields specifies containing collections, # which may have opinions about whether we should inherit metadata from parent works interpret_relationship_fields setMetadataInheritance interpret_option_fields interpret_file_fields interpret_controlled_fields interpret_scalar_fields connect_existing_work @proxy.update(status: "ERROR", message: "error parsing spreadsheet line") if @parsing_errors.present? @proxy.proxy_errors = (@proxy.proxy_errors || []) + @parsing_errors return @metadata end |
#interpret_file_fields ⇒ Object
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/bulk_ops/parser.rb', line 180 def interpret_file_fields # This method handles file additions and deletions from the spreadsheet # if additional files need to be deleted because the update is set to replace # some or all existing files, those replacement-related deletions are handled # by the BulkOps::Operation. # @raw_row.each do |field, value| next if value.blank? or field.blank? field = field.to_s #If our CSV interpreter is feeding us the headers as a line, ignore it. next if field == value # Check if this is a file field, and whether we are removing or adding a file next unless (action = BulkOps::Verification.is_file_field?(field)) # Move on if this field is the name of another property (e.g. masterFilename) next if find_field_name(field) # Check if we are removing a file if action == "remove" get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) } else # Add a file operation.get_file_paths(value).each do |filepath| begin uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user) (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil? rescue Exception => e report_error(:upload_error, message: "Error opening file: #{ filepath } -- #{e}", file: File.join(BulkOps::INGEST_MEDIA_PATH,filename), row_number: row_number) end end end # Check if any of the upcoming rows are child filesets i = 1 while self.class.is_file_set?(@metadata,row_number+i) child_row.each do |field,value| next if value.blank? title = value if ["title","label"].include?(field.downcase.strip) if BulkOps::Verification.is_file_field?(field) operation.get_file_paths(value).each do |filepath| uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user) end end end i+=1 end end end |
#interpret_option_fields ⇒ Object
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 |
# File 'lib/bulk_ops/parser.rb', line 235 def interpret_option_fields @raw_row.each do |field,value| next if value.blank? or field.blank? field = field.to_s next if value == field normfield = field.downcase.parameterize.gsub(/[_\s-]/,'') if ["visibility", "public"].include?(normfield) @proxy.update(visibility: format_visibility(value)) end if ["worktype","model","type"].include?(normfield) @proxy.update(work_type: format_worktype(value) ) end if ["referenceidentifier", "referenceid", "refid", "referenceidentifiertype", "referenceidtype", "refidtype", "relationshipidentifier", "relationshipid", "relationshipidentifiertype", "relationshipidtype", "relid", "relidtype"].include?(normfield) @proxy.update(reference_identifier: format_reference_id(value)) end end end |
#interpret_relationship_fields ⇒ Object
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 |
# File 'lib/bulk_ops/parser.rb', line 266 def interpret_relationship_fields @raw_row.each do |field,value| next if value.blank? or field.blank? field = field.to_s value = unescape_csv(value) identifer_type = reference_identifier next if value == field # Correctly interpret the notation "parent:id", "parent id" etc in a column header if (split = field.split(/[:_\-\s]/)).count == 2 identifier_type = split.last relationship_type = split.first.to_s else relationship_type = field end relationship_type = self.class.normalize_relationship_field_name(relationship_type) case relationship_type when "order" # If the field specifies the object's order among siblings @proxy.update(order: value.to_f) next when "collection" # If the field specifies the name or ID of a collection, # find or create the collection and update the metadata to match col = find_or_create_collection(value) ( @metadata[:member_of_collection_ids] ||= [] ) << col.id if col next when "parent", "child" # correctly interpret the notation "id:a78C2d81" identifier_type, object_identifier = interpret_relationship_value(identifier_type, value) relationship_parameters = { work_proxy_id: @proxy.id, identifier_type: identifier_type, relationship_type: relationship_type, object_identifier: object_identifier, status: "new"} #add previous sibling link if necessary previous_value = @raw_data[row_number-1][field] # Check if this is a parent relationship, and the previous row also has one if previous_value.present? && (relationship_type == "parent") # Check if the previous row has the same parent as this row if object_identifier == interpret_relationship_value(identifier_type, previous_value, field).last # If so, set the previous sibling parameter on the relationshp # to the id for the proxy associated with the previous row relationship_parameters[:previous_sibling] = operation.work_proxies.find_by(row_number: row_number-1).id end end BulkOps::Relationship.create(relationship_parameters) end end end |
#interpret_relationship_value(id_type, value, field = "parent") ⇒ Object
337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 |
# File 'lib/bulk_ops/parser.rb', line 337 def interpret_relationship_value id_type, value, field="parent" #Handle "id:20kj4259" syntax if it hasn't already been handled if (split = value.to_s.split(":")).count == 2 id_type, value = split.first value = split.last end #Handle special shorthand syntax for refering to relative row numbers if id_type == "row" #if the value is an integer if value =~ /\A[-+]?[0-9]+\z/ if value.to_i < 0 # if given a negative integer, count backwards from the current row (remember that value.to_i is negative) return [id_type,row_number + value.to_i] elsif value.to_i > 0 # if given a positive integer, remove the row offset value = (value.to_i - BulkOps::ROW_OFFSET).to_s end elsif value.to_s.downcase.include?("prev") # if given any variation of the word "previous", get the first preceding row with no parent of its own return [id_type,find_previous_parent(field)] end end return [id_type,value] end |
#interpret_scalar_fields ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/bulk_ops/parser.rb', line 163 def interpret_scalar_fields @raw_row.each do |field, values| next if values.blank? or field.nil? or field == values # get the field name, if this column is a metadata field next unless field_name = find_field_name(field.to_s) field = schema.get_field(field_name) # Ignore controlled fields next if field.controlled? split_values(values).each do |value| next if value.blank? value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_') unless value.blank? value = unescape_csv(value) (@metadata[field_name] ||= []) << value end end end |
#localAuthUrl(property, value) ⇒ Object
473 474 475 476 477 |
# File 'lib/bulk_ops/parser.rb', line 473 def localAuthUrl(property, value) return value if (auth = getLocalAuth(property)).nil? url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value) return url end |
#localIdToUrl(id, auth_name) ⇒ Object
412 413 414 415 416 417 418 |
# File 'lib/bulk_ops/parser.rb', line 412 def localIdToUrl(id,auth_name) root_urls = {production: "https://digitalcollections.library.ucsc.edu", staging: "http://digitalcollections-staging.library.ucsc.edu", development: "http://#{Socket.gethostname}", test: "http://#{Socket.gethostname}"} return "#{root_urls[Rails.env.to_sym]}/authorities/show/local/#{auth_name}/#{id}" end |
#mintLocalAuthUrl(auth_name, value) ⇒ Object
388 389 390 391 392 393 394 395 396 |
# File 'lib/bulk_ops/parser.rb', line 388 def mintLocalAuthUrl(auth_name, value) value.strip! id = value.parameterize auth = Qa::LocalAuthority.find_or_create_by(name: auth_name) entry = Qa::LocalAuthorityEntry.create(local_authority: auth, label: value, uri: id) return localIdToUrl(id,auth_name) end |
#record_exists?(id) ⇒ Boolean
469 470 471 |
# File 'lib/bulk_ops/parser.rb', line 469 def record_exists? id operation.record_exists? id end |
#report_error(type, message, **args) ⇒ Object
443 444 445 446 447 448 |
# File 'lib/bulk_ops/parser.rb', line 443 def report_error type, message, **args puts "ERROR MESSAGE: #{message}" @proxy.update(status: "error", message: message) args[:type]=type (@parsing_errors ||= []) << BulkOps::Error.new(**args) end |
#schema ⇒ Object
502 503 504 |
# File 'lib/bulk_ops/parser.rb', line 502 def schema ScoobySnacks::METADATA_SCHEMA end |
#setAdminSet ⇒ Object
431 432 433 434 435 436 |
# File 'lib/bulk_ops/parser.rb', line 431 def setAdminSet return if @metadata[:admin_set_id] asets = AdminSet.where({title: "Bulk Ingest Set"}) asets = AdminSet.find('admin_set/default') if asets.blank? @metadata[:admin_set_id] = Array(asets).first.id unless asets.blank? end |
#setMetadataInheritance ⇒ Object
438 439 440 441 |
# File 'lib/bulk_ops/parser.rb', line 438 def setMetadataInheritance return if @metadata[:metadataInheritance].present? @metadata[:metadataInheritance] = operation.options["metadataInheritance"] unless operation.options["metadataInheritance"].blank? end |
#split_values(value_string) ⇒ Object
515 516 517 518 519 |
# File 'lib/bulk_ops/parser.rb', line 515 def split_values value_string # Split values on all un-escaped separator character (escape character is '\') # Then replace all escaped separator charactors with un-escaped versions value_string.split(/(?<!\\)#{BulkOps::SEPARATOR}/).map{|val| val.gsub("\\#{BulkOps::SEPARATOR}",BulkOps::SEPARATOR).strip} end |
#unescape_csv(value) ⇒ Object
362 363 364 |
# File 'lib/bulk_ops/parser.rb', line 362 def unescape_csv(value) value.gsub(/\\(['";,])/,'\1') end |