Module: Revs::Utils

Defined in:
lib/revs-utils.rb,
lib/revs-utils/version.rb

Constant Summary collapse

AUTOMOBILE_LC_TERMS =

A hash of LC Subject Heading terms and their IDs, used for linking, under “Automobiles” (id.loc.gov/authorities/subjects/sh85010201.html). It is cached on disk and deserialized back into a hash for performance reasons, then stored as a module-level constant so it can be reused throughout the pre-assembly run.

This cached set of terms can be re-generated with "ruby bin/revs_lc_automobile_terms.rb"
File.open(REVS_LC_TERMS_FILENAME,'rb'){|io| Marshal.load(io)}
REVS_MANIFEST_HEADERS_FILE =
File.open(REVS_MANIFEST_HEADERS_FILEPATH)
REVS_MANIFEST_HEADERS =
YAML.load( REVS_MANIFEST_HEADERS_FILE)
ARCHIVE_DRUIDS =

a hash of druids of the master archives, keys are arbitrary but druids must match the druids in DOR

{:revs=>'nt028fd5773',:roadandtrack=>'mr163sv5231'}
ARCHIVE_NAMES =

the names of the archives (will be used when indexing instead of the title of the collection itself)

{:revs=>'Revs Institute® Archives',:roadandtrack=>'Road & Track Archive'}
MULTI_COLLECTION_ARCHIVES =

list the keys from the hash above for any archives that contain multiple collections (like Revs), for which each item in DOR belongs to both a parent collection and the master archive collection … since we do not want to also add the master archive name as another collection druid to each record, we skip them

[:revs]
VERSION =
"2.2.0"

Instance Method Summary collapse

Instance Method Details

#blank_value?(value) ⇒ Boolean

tells you if you have a blank value or an array that has just blank values

Returns:

  • (Boolean)


114
115
116
# File 'lib/revs-utils.rb', line 114

# Reports whether +value+ carries no usable content.
#
# An Array counts as blank when it is empty or when every element is blank;
# any other object is judged by its own +blank?+. The argument is only
# inspected, never modified, so callers can keep using the array they passed in.
#
# @param value [Object, Array] a single value or a list of values
# @return [Boolean] true when nothing non-blank is present
def blank_value?(value)
  value.is_a?(Array) ? value.all?(&:blank?) : value.blank?
end

#check_headers(csv_data) ⇒ Object

pass in csv data from a file read in and it will tell you if the headers are valid



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/revs-utils.rb', line 230

# Validates the header row of already-parsed manifest data.
#
# Three checks are made, each printing a message via +puts+ when it fails:
# * 'date' and 'year' columns must not both be present
# * 'location' must not appear together with all of 'state', 'city' and 'country'
# * every (non-blank, downcased) header must be a value of the METADATA,
#   REGISTER or OPTIONAL manifest section
#
# @param csv_data [Array<Hash>] rows as returned by read_csv_with_headers; only the first row's keys are read
# @return [Boolean] true when all three checks pass
def check_headers(csv_data)
  headers = csv_data[0].keys.reject(&:blank?).map(&:downcase)

  # The file doesn't need to have all the metadata values, it just can't have
  # headers that aren't used for metadata or registration.
  date_conflict = headers.include?('date') && headers.include?('year') # can't have both date and year
  puts "has both year and date columns" if date_conflict

  # can't have both location and the specific fields
  location_conflict = %w[location state city country].all? { |column| headers.include?(column) }
  puts "has location column as well as specific state,city,country columns" if location_conflict

  unknown_columns = headers -
                    get_manifest_section(METADATA).values -
                    get_manifest_section(REGISTER).values -
                    get_manifest_section(OPTIONAL).values
  puts "has unknown columns: #{unknown_columns.join(', ')}" unless unknown_columns.empty?

  !date_conflict && !location_conflict && unknown_columns.empty?
end

#check_metadata(csv_data) ⇒ Object

looks at certain metadata fields in manifest to confirm validity (such as dates and formats)



216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/revs-utils.rb', line 216

# Looks at certain metadata fields in the manifest to confirm validity.
#
# For each row the year (or, when the year cell is nil, the date) is checked
# with revs_is_valid_datestring? and the format with revs_is_valid_format?.
# Every failing row is reported via +puts+ together with its sourceid.
#
# @param csv_data [Array<Hash>] rows as returned by read_csv_with_headers
# @return [Integer] the number of rows with a bad year/date or format
def check_metadata(csv_data)
  metadata_columns = get_manifest_section(METADATA)
  sourceid_column = get_manifest_section(REGISTER)['sourceid']
  bad_rows = 0
  csv_data.each_with_index do |row, i|
    valid_date = revs_is_valid_datestring?(row[metadata_columns['year']] || row[metadata_columns['date']])
    valid_format = revs_is_valid_format?(row[metadata_columns['format']])
    next if valid_date && valid_format

    bad_rows += 1
    puts "Row #{i}: #{row[sourceid_column]} has a bad year/date or format"
  end
  bad_rows
end

#check_valid_to_register(csv_data) ⇒ Object

pass in csv data and it will tell you if everything is safe to register, based on having labels, unique sourceIDs, and filenames matching sourceIDs



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/revs-utils.rb', line 183

# Tells whether parsed manifest data is safe to register.
#
# Checks, reporting each problem via +puts+:
# 1. every column named in the REGISTER manifest section is present
#    (judged from the first row; empty input counts as missing headers)
# 2. every row has a value for each registration column except label
# 3. every sourceid equals its filename minus the extension and contains no whitespace
# 4. sourceids are unique across all rows (duplicates are listed with their frequency)
#
# Rows with an empty filename cell are reported by check 3 rather than
# raising NoMethodError.
#
# @param csv_data [Array<Hash>] rows as returned by read_csv_with_headers
# @return [Boolean] true only when all four checks pass
def check_valid_to_register(csv_data)
  register = get_manifest_section(REGISTER)
  sourceid_column = register['sourceid']
  filename_column = register['filename']

  # Make sure all the required headers are there
  first_row = csv_data[0]
  present_headers = first_row ? first_row.keys : []
  headers_ok = (register.values - present_headers).empty?
  puts "missing headers required for registration" unless headers_ok

  values_ok = true
  names_ok = true
  sources = []

  # Make sure all files have entries for those required headers
  csv_data.each_with_index do |row, i|
    sourceid = row[sourceid_column]

    register.each do |field, column|
      next if field.downcase == 'label' # label should be there as a column but does not always need a value
      next unless row[column].blank?    # look the cell up by column name, as the checks below do

      puts "Row #{i}: #{sourceid} does not have a value for a required registration field"
      values_ok = false
    end

    filename = row[filename_column].to_s # a missing filename cell is nil; treat it as empty
    fname = filename.chomp(File.extname(filename))
    if sourceid != fname || sourceid.strip =~ /\s/
      puts "Row #{i}: #{sourceid} does not match the filename or has a space in it"
      names_ok = false
    end

    sources << sourceid
  end

  unique_ok = (sources.uniq.size == sources.size)
  unless unique_ok
    puts "sourceIDs are not all unique"
    frequencies = Hash.new(0)
    sources.each { |source| frequencies[source] += 1 }
    # show all non-unique sourceIDs and their frequency, most frequent first
    repeated = frequencies.select { |_, count| count > 1 }.map { |source, count| [count, source] }
    puts repeated.sort.reverse.map { |count, source| "#{source}: #{count}" }
  end

  headers_ok && values_ok && names_ok && unique_ok
end

#clean_collection_name(name) ⇒ Object



252
253
254
255
256
257
258
259
260
261
# File 'lib/revs-utils.rb', line 252

# Normalizes a collection name for display.
#
# Removes a leading "the " and a trailing " of the Revs Institute" (also the
# longer "... for Automotive Research", "..., Inc" and "..., Inc." forms),
# case-insensitively, then strips surrounding whitespace. The argument itself
# is never modified, so frozen strings are accepted.
#
# @param name [#to_s, nil] the raw collection name
# @return [String] the cleaned name, or "" when +name+ is blank
def clean_collection_name(name)
  return "" if name.blank?

  cleaned = name.to_s.sub(/\A(the )/i, '')
  # Each suffix is anchored at the end of the string; they are tried in turn, shortest first.
  [
    /( of the revs institute)\z/i,
    /( of the revs institute for automotive research)\z/i,
    /( of the revs institute for automotive research, inc)\z/i,
    /( of the revs institute for automotive research, inc\.)\z/i
  ].each { |suffix| cleaned = cleaned.sub(suffix, '') }
  cleaned.strip
end

#clean_marque_name(name) ⇒ Object



263
264
265
266
267
268
269
# File 'lib/revs-utils.rb', line 263

# Normalizes a marque name by dropping a trailing "automobiles" and then a
# trailing "automobile" (case-insensitive), and stripping surrounding whitespace.
# The argument itself is never modified, so frozen strings are accepted.
#
# @param name [#to_s, nil] the raw marque name
# @return [String] the cleaned name, or "" when +name+ is blank
def clean_marque_name(name)
  return "" if name.blank?

  name.to_s.sub(/(automobiles)\z/i, '').sub(/(automobile)\z/i, '').strip
end

#get_full_date(date_string) ⇒ Object

tells us if the string passed in is a full date of the format M/D/YYYY, m-d-yyyy, m-d-yy, or M/D/YY, and returns the date object if it is valid



395
396
397
398
399
400
401
402
403
404
# File 'lib/revs-utils.rb', line 395

# Parses a full date such as M/D/YYYY, m-d-yyyy, m-d-yy or M/D/YY.
#
# @param date_string [String] the candidate date
# @return [Date, false] the parsed date when the string holds at least two
#   "/" or "-" separators, Chronic can parse it and its year passes
#   is_valid_year?; false otherwise (including when any error is raised)
def get_full_date(date_string)
  separators = date_string.scan(%r{[-/]})
  return false if separators.size < 2 # we need at least two / or - characters to count as a full date

  parsed = Chronic.parse(date_string).to_date
  # A date in the future is a problem, so adjust back a century (two-digit year parsing:
  # http://stackoverflow.com/questions/27058068/ruby-incorrectly-parses-2-digit-year)
  parsed = parsed.prev_year(100) if parsed > Date.today
  return false unless is_valid_year?(parsed.year.to_s)

  parsed
rescue
  false
end

#get_manifest_section(section) ⇒ Object



122
123
124
# File 'lib/revs-utils.rb', line 122

# Looks up one section of the loaded manifest headers.
#
# @param section [Object] key into REVS_MANIFEST_HEADERS (e.g. REGISTER or METADATA)
# @return [Object, nil] whatever REVS_MANIFEST_HEADERS holds under that key
def get_manifest_section(section)
  REVS_MANIFEST_HEADERS[section]
end

#is_valid_year?(date_string, starting_year = 1800) ⇒ Boolean

tell us if the string passed is a valid year

Returns:

  • (Boolean)


382
383
384
# File 'lib/revs-utils.rb', line 382

# Tells whether the value passed is a valid year.
#
# @param date_string [#to_s, #to_i] the candidate year
# @param starting_year [Integer] earliest year accepted (inclusive)
# @return [Boolean] true when the stripped string form contains only digits
#   and the integer value lies between +starting_year+ and the current year
def is_valid_year?(date_string, starting_year = 1800)
  return false if date_string.to_s.strip =~ /\D/

  allowed_years = (starting_year..Date.today.year)
  allowed_years.include?(date_string.to_i)
end

#manifest_headers_fileObject



126
127
128
# File 'lib/revs-utils.rb', line 126

# Accessor for the module constant REVS_MANIFEST_HEADERS_FILE.
#
# @return [Object] the value of REVS_MANIFEST_HEADERS_FILE
def manifest_headers_file
  REVS_MANIFEST_HEADERS_FILE
end

#manifest_headers_pathObject



130
131
132
# File 'lib/revs-utils.rb', line 130

# Accessor for the path of the manifest headers file.
#
# Reads REVS_MANIFEST_HEADERS_FILEPATH, the same constant that
# REVS_MANIFEST_HEADERS_FILE is opened from.
#
# @return [Object] the value of REVS_MANIFEST_HEADERS_FILEPATH
def manifest_headers_path
  REVS_MANIFEST_HEADERS_FILEPATH
end

#manifest_metadata_section_nameObject



138
139
140
# File 'lib/revs-utils.rb', line 138

# Accessor for the name of the metadata section of the manifest headers.
#
# @return [Object] the value of the METADATA constant
def manifest_metadata_section_name
  METADATA
end

#manifest_register_section_nameObject



134
135
136
# File 'lib/revs-utils.rb', line 134

# Accessor for the name of the registration section of the manifest headers.
#
# @return [Object] the value of the REGISTER constant
def manifest_register_section_name
  REGISTER
end

#parse_location(row, location) ⇒ Object



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# File 'lib/revs-utils.rb', line 271

# Splits a combined location cell into its parts and stores them on the row.
#
# The cell is split on "," or "|" and the pieces are visited last-to-first.
# A piece recognised by revs_get_country sets row['country']; a piece
# recognised by revs_get_city_state sets row['state'] and row['city'];
# a piece recognised by neither is stored as-is in row['city_section'].
#
# @param row [Hash] the manifest row; updated in place
# @param location [String] the key of the combined location cell in +row+
# @return [Hash] the same +row+
def parse_location(row, location)
  pieces = row[location].split(/[,|]/)
  pieces.reverse_each do |local|
    country = revs_get_country(local)
    city_state = revs_get_city_state(local)

    row['country'] = country.strip if country

    if city_state
      city, state = city_state
      row['state'] = revs_get_state_name(state.strip)
      row['city'] = city.strip
    elsif !country
      row['city_section'] = local
    end
  end

  row
end

#parse_years(date_string) ⇒ Object

given a string with dates separated by commas (or pipes), split it into an array; also parse dates like “195x” and “1961-62” into all of the years in that range



408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
# File 'lib/revs-utils.rb', line 408

# Turns a list of years into a sorted array of individual year strings.
#
# Spaces are ignored. Entries are separated by "|" when the string contains
# one, otherwise by ",". Besides plain four-digit years these forms are expanded:
# * "1961-62" / "1961-3"    => every year from the first to the abbreviated last
# * "195x", "195_", "1950s", "1950's" => every year of that decade
# * "1961-1965"             => every year in the range, but only when the
#                              range spans fewer than 10 years
# Anything else is dropped. The argument itself is never modified.
#
# @param date_string [String, nil] the raw cell value (nil yields [])
# @return [Array<String>] unique years, sorted
def parse_years(date_string)
  cleaned = date_string.to_s.delete(' ')
  tokens = cleaned.split(cleaned.include?('|') ? '|' : ',')
  expanded = []

  tokens.each do |token|
    if token.size == 7 && token =~ /[1-2][0-9][0-9][0-9][-][0-9][0-9]/
      # "1961-62" or "1961-73": keep the century, walk the two-digit years
      century = token[0..1]
      (token[2..3]..token[5..6]).each { |suffix| expanded << "#{century}#{suffix}" }
    elsif token.size == 6 && token =~ /[1-2][0-9][0-9][0-9][-][1-9]/
      # "1961-2" or "1961-3": keep the decade, walk the final digit
      decade = token[0..2]
      (token[3..3]..token[5..5]).each { |suffix| expanded << "#{decade}#{suffix}" }
    end

    if token =~ /[1-2][0-9][0-9][0](('s)|s)/ || token =~ /[1-2][0-9][0-9][x_]/
      # "195x" / "1950s": expand into the whole decade
      decade = token[0..2]
      ('0'..'9').each { |digit| expanded << "#{decade}#{digit}" }
    end

    if token.size == 9 && token =~ /[1-2][0-9][0-9][0-9][-][1-2][0-9][0-9][0-9]/
      first_year = token[0..3]
      last_year = token[5..8]
      # only expand short ranges; something like "1930-1985" is left out entirely
      (first_year..last_year).each { |year| expanded << year } if last_year.to_i - first_year.to_i < 10
    end
  end

  # Keep only tokens that are plain four-digit years; filtering into a new
  # array (rather than deleting while iterating) guarantees every token is examined.
  plain_years = tokens.uniq.select { |token| token =~ /\A[1-2][0-9][0-9][0-9]\z/ }
  (plain_years + expanded).uniq.sort
end

#read_csv_with_headers(file) ⇒ Object



142
143
144
145
146
147
148
# File 'lib/revs-utils.rb', line 142

# Loads a CSV file into an array of hashes, one per data row, keyed by the
# header row. Each hash is converted with +with_indifferent_access+.
#
# The file is read with File.read, which always treats +file+ as a path
# (IO.read would run a command when given a string starting with "|").
#
# @param file [String] path of the CSV file
# @return [Array<Hash>] the rows
def read_csv_with_headers(file)
  # Earlier encoding workaround, kept for reference:
  # file_contents = IO.read(file).force_encoding("ISO-8859-1").encode("utf-8", replace: nil)
  file_contents = File.read(file)
  csv = CSV.parse(file_contents, :headers => true)
  csv.map { |row| row.to_hash.with_indifferent_access }
end

#revs_check_format(format) ⇒ Object

check a single format and fix some common issues



296
297
298
# File 'lib/revs-utils.rb', line 296

# Checks a single format and fixes some common issues by running it through
# revs_check_formats as a one-element list.
#
# @param format [String] the format to check
# @return [String] the first (only) element returned by revs_check_formats
def revs_check_format(format)
  corrected = revs_check_formats([format])
  corrected.first
end

#revs_check_formats(format) ⇒ Object

check the incoming array of formats and fix some common issues



301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# File 'lib/revs-utils.rb', line 301

# Checks the incoming array of formats and fixes some common issues.
#
# Every entry is downcased and, when the downcased text is a known variant,
# replaced by its preferred spelling. The array is rewritten in place and
# the same array is returned.
#
# @param format [Array<String>] the formats to normalise (modified in place)
# @return [Array<String>] +format+ itself
def revs_check_formats(format)
  preferred = {
    "black-and-white negative" => "black-and-white negatives",
    "color negative" => "color negatives",
    "slides/color transparency" => "color transparencies",
    "color negatives/slides" => "color negatives",
    "black-and-white negative strips" => "black-and-white negatives",
    "black and white" => "black-and-white negatives",
    "black-and-white" => "black-and-white negatives",
    "black and white negative" => "black-and-white negatives",
    "black and white negatives" => "black-and-white negatives",
    "color transparency" => "color transparencies",
    "slide" => "slides",
    "color transparancies" => "color transparencies"
  }

  format.each_with_index do |entry, position|
    lowered = entry.downcase
    format[position] = preferred.fetch(lowered, lowered)
  end
  format
end

#revs_compute_score(doc_hash) ⇒ Object

these are used in the revs solr document in the main revs digital library rails app, as well as the revs-indexing-service app



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/revs-utils.rb', line 91

# Computes a metadata score for a document, as a percentage.
#
# Every entry of revs_field_mappings that has a weight contributes that weight
# to the maximum; it contributes the same weight to the score when the
# document's value for the mapped field is not blank (see blank_value?).
# Location adds a further weight of 3, earned when the revs_location string,
# the :venue entry or the :event entry is present.
#
# @param doc_hash [Hash] the document, keyed by field name
# @return [Integer] score / maximum * 100, rounded up
def revs_compute_score(doc_hash)
  doc = doc_hash.with_indifferent_access
  earned = 0
  possible = 0

  revs_field_mappings.each_value do |config|
    next if config[:weight].blank?

    weight = config[:weight].to_f
    # a blank field is worth 0 regardless of weight, otherwise 1 times its weight
    earned += weight * (blank_value?(doc[config[:field]]) ? 0 : 1)
    possible += weight
  end

  # location has a weighting of 3 for *any* location-like field having a value
  location_weight = 3
  no_location = revs_location(doc).blank? && doc[:venue].blank? && doc[:event].blank?
  location_score = no_location ? 0 : 1
  possible += location_weight
  earned += (location_score * location_weight)

  ((earned / possible) * 100).ceil
end

#revs_field_mappingsObject

these are used in the revs solr document in the main revs digital library rails app, as well as the revs-indexing-service app



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/revs-utils.rb', line 37

# Maps each logical field name to its configuration.
#
# Each configuration hash carries :field (the name of the field in the
# document) and optionally :multi_valued, :weight (read by revs_compute_score),
# :default and :editstore.
#
# @return [Hash{Symbol => Hash}] a freshly built mapping on every call
def revs_field_mappings
  {
    title:               { field: 'title_tsi', default: 'Untitled' },
    description:         { field: 'description_tsim', multi_valued: true, weight: 3 },
    photographer:        { field: 'photographer_ssi', weight: 1 },
    years:               { field: 'pub_year_isim', multi_valued: true, weight: 5 },
    single_year:         { field: 'pub_year_single_isi' },
    full_date:           { field: 'pub_date_ssi' },
    people:              { field: 'people_ssim', multi_valued: true, weight: 4 },
    subjects:            { field: 'subjects_ssim', multi_valued: true },
    city_section:        { field: 'city_sections_ssi' },
    city:                { field: 'cities_ssi' },
    state:               { field: 'states_ssi' },
    country:             { field: 'countries_ssi' },
    formats:             { field: 'format_ssim', multi_valued: true },
    original_size:       { field: 'original_size_ssi' },
    identifier:          { field: 'source_id_ssi' },
    production_notes:    { field: 'prod_notes_tsi' },
    institutional_notes: { field: 'inst_notes_tsi' },
    metadata_sources:    { field: 'metadata_sources_tsi' },
    has_more_metadata:   { field: 'has_more_metadata_ssi' },
    vehicle_markings:    { field: 'vehicle_markings_tsi', weight: 1 },
    marque:              { field: 'marque_ssim', multi_valued: true, weight: 4 },
    vehicle_model:       { field: 'model_ssim', multi_valued: true, weight: 2 },
    model_year:          { field: 'model_year_ssim', multi_valued: true, weight: 1 },
    current_owner:       { field: 'current_owner_tsi', weight: 1 },
    entrant:             { field: 'entrant_ssim', multi_valued: true, weight: 1 },
    venue:               { field: 'venue_ssi' },
    engine_type:         { field: 'engine_type_ssi' },
    track:               { field: 'track_ssi', weight: 1 },
    event:               { field: 'event_ssi' },
    # group_class is being split into car_group and car_class, but the old field
    # is kept until all data is migrated (July 2016, Peter Mangiafico)
    group_class:         { field: 'group_class_tsi', weight: 1 },
    car_group:           { field: 'group_ssim', multi_valued: true, weight: 1 },
    car_class:           { field: 'class_ssi', weight: 1 },
    race_data:           { field: 'race_data_tsi', weight: 1 },
    priority:            { field: 'priority_isi', default: 0, editstore: false },
    collections:         { field: 'is_member_of_ssim', multi_valued: true },
    collection_names:    { field: 'collection_ssim', multi_valued: true, editstore: false },
    archive_name:        { field: 'archive_ssi', editstore: false },
    highlighted:         { field: 'highlighted_ssi', editstore: false },
    visibility_value:    { field: 'visibility_isi', editstore: false },
    score:               { field: 'score_isi', editstore: false },
    timestamp:           { field: 'timestamp', editstore: false },
    resaved_at:          { field: 'resaved_at_ssi', editstore: false }
  }
end

#revs_get_city_state(name) ⇒ Object

parse a string like this: “San Mateo (Calif.)” to try and figure out if there is any state in there; if found, return the city and state as an array, if none found, return false



355
356
357
358
359
360
361
362
363
364
365
# File 'lib/revs-utils.rb', line 355

# Tries to split a string like "San Mateo (Calif.)" into city and state.
#
# The state is the first parenthesised run of non-space characters; the city
# is what remains after every occurrence of that parenthetical is removed.
#
# @param name [String] the location text
# @return [Array(String, String), false] [city, state], or false when the
#   string contains no such parenthetical
def revs_get_city_state(name)
  parenthetical = name[/[(]\S+[)]/]
  return false if parenthetical.nil?

  state = parenthetical.delete('()').strip  # drop the parens
  city = name.gsub(parenthetical, '').strip # drop the state from the input
  [city, state]
end

#revs_get_country(name) ⇒ Object

check if the string passed is a country name or code – if so, return the country name, if not a recognized country, return false



343
344
345
346
347
348
349
350
351
352
# File 'lib/revs-utils.rb', line 343

# Checks whether the string passed is a country name or code.
#
# Surrounding whitespace is ignored. "USA" is looked up as "US" (special
# case; USA is not recognized by the country gem, but US is). The text is
# tried both as a name and as a code; the code match wins when both succeed.
#
# @param name [String] candidate country name or code
# @return [String, false] the country name, or false when not recognised
def revs_get_country(name)
  candidate = name.strip
  candidate = 'US' if candidate == 'USA' # compare after stripping so " USA" is handled too

  by_name = Country.find_country_by_name(candidate) # find it by name
  by_code = Country.new(candidate)                  # find it by code
  return false if by_name.nil? && by_code.data.nil?

  by_code.data.nil? ? by_name.name : by_code.name
end

#revs_get_state_name(name) ⇒ Object

given an abbreviated state name (e.g. “Calif.” or “CA”) return the full state name (e.g. “California”)



368
369
370
371
372
373
374
375
376
377
378
# File 'lib/revs-utils.rb', line 368

# Given an abbreviated state name (e.g. "Calif." or "CA"), returns the full
# state name (e.g. "California").
#
# Periods and surrounding whitespace are ignored and matching is
# case-insensitive. A US state matches when its name starts with the text or
# its key equals the text; the first match wins. When nothing matches, or
# when the text is empty after normalisation, +name+ is returned unchanged.
#
# @param name [String] abbreviated or full state name
# @return [String] the full state name, or +name+ itself when unrecognised
def revs_get_state_name(name)
  test_name = name.gsub('.', '').strip.downcase
  return name if test_name.empty? # every state name "starts with" the empty string

  match = Country.new('US').states.find do |code, details|
    details['name'].downcase.start_with?(test_name) || code.downcase == test_name
  end
  match ? match.last['name'] : name
end

#revs_is_valid_datestring?(date_string) ⇒ Boolean

tell us if the incoming datestring supplied in the manifest column is a valid date, year or list of years

Returns:

  • (Boolean)


387
388
389
390
391
392
# File 'lib/revs-utils.rb', line 387

# Tells whether the datestring supplied in the manifest column is a valid
# date, year or list of years.
#
# @param date_string [String, nil] the cell value
# @return [Boolean] true when the value is nil or empty, when get_full_date
#   accepts it, or when parse_years finds at least one year in it
def revs_is_valid_datestring?(date_string)
  return true if date_string.nil? || date_string.empty?

  # Both helpers are always called, full-date check first.
  full_date = get_full_date(date_string)
  years = parse_years(date_string)
  !years.empty? || full_date != false
end

#revs_is_valid_format?(format) ⇒ Boolean

checks to see if we have a valid format

Returns:

  • (Boolean)


289
290
291
292
293
# File 'lib/revs-utils.rb', line 289

# Checks whether a format cell holds only known formats.
#
# The cell may list several formats separated by "|"; each is stripped and
# looked up in revs_known_formats.
#
# @param format [String, nil] the cell value
# @return [Boolean] true when the value is nil/blank or every listed format is known
def revs_is_valid_format?(format)
  return true if format.nil? || format.blank?

  listed = format.split("|").map { |entry| entry.strip }
  unknown = listed.reject { |entry| revs_known_formats.include?(entry) }
  unknown.empty?
end

#revs_known_formatsObject



118
119
120
# File 'lib/revs-utils.rb', line 118

# Returns the FORMATS section of the manifest headers, which
# revs_is_valid_format? uses as the list of accepted formats.
#
# @return [Object] whatever get_manifest_section returns for FORMATS
def revs_known_formats
  section_key = FORMATS
  get_manifest_section(section_key)
end

#revs_location(doc_hash) ⇒ Object

these are used in the revs solr document in the main revs digital library rails app, as well as the revs-indexing-service app



85
86
87
88
# File 'lib/revs-utils.rb', line 85

# Builds a display string for a document's location.
#
# Joins the non-blank values of city section, city, state and country
# (in that order) with ", ".
#
# @param doc_hash [Hash] the document, keyed by field name
# @return [String] the joined location, or "" when all four parts are blank
def revs_location(doc_hash)
  doc = doc_hash.with_indifferent_access
  location_keys = [:city_sections_ssi, :cities_ssi, :states_ssi, :countries_ssi]
  parts = location_keys.map { |key| doc[key] }
  parts.reject(&:blank?).join(', ')
end

#revs_lookup_marque(marque) ⇒ Object

lookup the marque sent to see if it matches any known LC terms, trying a few varieties; returns a hash of the term and its ID if match is found, else returns false



324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'lib/revs-utils.rb', line 324

# Looks up a marque among the known LC terms, trying a few varieties.
#
# The candidates are the marque as given, capitalized, singularized and
# pluralized (and the capitalized form singularized/pluralized), followed by
# each of those with " automobile" and " automobiles" appended. The first
# candidate found in AUTOMOBILE_LC_TERMS wins.
#
# @param marque [String] the marque to look up
# @return [Hash, false] {'url' => term id, 'value' => matching candidate}, or false when nothing matches
def revs_lookup_marque(marque)
  capitalized = marque.capitalize
  base_names = [
    marque,
    capitalized,
    marque.singularize,
    marque.pluralize,
    capitalized.singularize,
    capitalized.pluralize
  ]
  suffixed = base_names.flat_map { |name| ["#{name} automobile", "#{name} automobiles"] }

  (base_names + suffixed).each do |variant|
    term_id = AUTOMOBILE_LC_TERMS[variant]
    return { 'url' => term_id, 'value' => variant } if term_id
  end
  false
end

#unique_source_ids(file_paths) ⇒ Object

Pass this function a list of all CSVs containing metadata for files you are about to register and it will ensure each sourceid is unique



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/revs-utils.rb', line 151

# Ensures each sourceid is unique across a list of CSVs that hold metadata
# for files about to be registered.
#
# All files are read first. The check then fails as soon as a row's sourceid
# differs from its filename minus the extension, or contains whitespace.
# A row with an empty filename cell fails that comparison (it does not raise).
#
# @param file_paths [Array<String>] paths of the CSV files
# @return [Boolean] true when every sourceid matches its filename and no sourceid repeats
def unique_source_ids(file_paths)
  register = get_manifest_section(REGISTER)
  sourceid_column = register['sourceid']
  filename_column = register['filename']

  files = file_paths.map { |path| read_csv_with_headers(path) }

  sources = []
  files.each do |rows|
    rows.each do |row|
      sourceid = row[sourceid_column]
      filename = row[filename_column].to_s # a missing filename cell is nil; treat it as empty
      # Make sure the sourceid and filename are the same
      fname = filename.chomp(File.extname(filename))
      return false if sourceid != fname || sourceid.strip =~ /\s/

      sources << sourceid
    end
  end
  sources.uniq.size == sources.size
end

#valid_for_metadata(file_path) ⇒ Object

Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in.



177
178
179
180
# File 'lib/revs-utils.rb', line 177

# Reads a CSV file and validates its headers with check_headers.
#
# @param file_path [String] path of the CSV file
# @return [Boolean] the result of check_headers for the file's rows
def valid_for_metadata(file_path)
  file = read_csv_with_headers(file_path)
  check_headers(file)
end

#valid_to_register(file_path) ⇒ Object

Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in



171
172
173
174
# File 'lib/revs-utils.rb', line 171

# Reads a CSV file and runs the registration checks on it.
#
# @param file_path [String] path of the CSV file
# @return [Boolean] the result of check_valid_to_register for the file's rows
def valid_to_register(file_path)
  check_valid_to_register(read_csv_with_headers(file_path))
end