Class: Geomash::Standardizer

Inherits: Object
Defined in: lib/geomash/standardizer.rb

Class Method Summary

Class Method Details

.dedup_geo(geo_list, aggressive = false) ⇒ Object

Attempt to dedup a list of geographic areas. FIXME: Horrendous first pass. The aggressive flag removes less specific matches, e.g. ['Hanoi, Vietnam', 'Vietnam'] would return just ['Hanoi, Vietnam'].



# File 'lib/geomash/standardizer.rb', line 124

def self.dedup_geo(geo_list, aggressive=false)
  geo_list = geo_list.clone

  base_word_geo_list = []
  geo_list.each do |geo_term|
    geo_term = geo_term.gsub('(','').gsub(')','').gsub('.','').gsub(',','').gsub(';','')
    #Remove common junk terms
    Geomash::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }

    geo_term = geo_term.squish

    base_word_geo_list << geo_term
  end

  indexes_to_remove = []

  0.upto base_word_geo_list.size-1 do |index|
    matched_words_count = []
    current_best_term = geo_list[index]
    current_best_term_index = index

    base_word_geo_list[index].split(' ').each { |word|

      (index+1).upto base_word_geo_list.size-1 do |inner_index|
        if base_word_geo_list[inner_index].split(' ').any? { |single_word| single_word == word }
          matched_words_count[inner_index] ||= 0
          matched_words_count[inner_index] += 1
        end
      end
    }

    matched_words_count.each_with_index do |matched_count, matched_index|
      matched_count ||= 0
      matched_size = base_word_geo_list[matched_index].split(' ').size
      current_size = base_word_geo_list[index].split(' ').size

      if matched_count == matched_size && ((matched_size < current_size && aggressive) || matched_size == current_size)
        if current_best_term.split(',').size < geo_list[matched_index].split(',').size || (current_best_term.size+1 < geo_list[matched_index].size && !geo_list[matched_index].include?('('))
          current_best_term =  geo_list[matched_index]
          indexes_to_remove << current_best_term_index
          current_best_term_index = matched_index
        else
          indexes_to_remove << matched_index
        end
      end

    end
  end

  indexes_to_remove.each do |removal_index|
    geo_list[removal_index] = nil
  end

  return geo_list.compact
end
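
A minimal usage sketch based on the example in the description above (assumes the gem and Geomash::Constants are loaded; return values are illustrative):

Geomash::Standardizer.dedup_geo(['Hanoi, Vietnam', 'Vietnam'], true)
#=> ["Hanoi, Vietnam"]
Geomash::Standardizer.dedup_geo(['Hanoi, Vietnam', 'Vietnam'])
#=> ["Hanoi, Vietnam", "Vietnam"]   (without the aggressive flag, the less specific term is kept)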

.LCSHize(value) ⇒ Object

Take LCSH subjects and make them standard.



# File 'lib/geomash/standardizer.rb', line 207

def self.LCSHize(value)
  #Remove ending periods ... except when an initial or etc.
  if value.last == '.' && value[-2].match(/[^A-Z]/) && !value[-4..-1].match('etc.')
    value = value.slice(0..-2)
  end

  #Fix when '- -' occurs
  value = value.gsub(/-\s-/,'--')

  #Fix for "em" dashes - two types?
  value = value.gsub('—','--')

  #Fix for "em" dashes - two types?
  value = value.gsub('–','--')

  #Fix for ' - ' combinations
  value = value.gsub(' - ','--')

  #Remove white space after and before  '--'
  value = value.gsub(/\s+--/,'--')
  value = value.gsub(/--\s+/,'--')

  #Ensure first word is capitalized
  value[0] = value.first.capitalize[0]

  #Strip any white space
  value = strip_value(value)

  return value
end
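
A hedged usage sketch (assumes ActiveSupport, ActionView, and the htmlentities gem are available, since this calls strip_value; the input string is illustrative):

Geomash::Standardizer.LCSHize('Boston (Mass.) - History.')
#=> "Boston (Mass.)--History"   (trailing period removed, ' - ' normalized to '--')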

.parse_for_geographic_term(term) ⇒ Object

Take a subject string and look for potential geographic terms.



# File 'lib/geomash/standardizer.rb', line 6

def self.parse_for_geographic_term(term)
  geo_term = ''

  #Likely too long to be an address... some fields have junk with an address string...
  if term.length > 125
    return ''
  end

  term_split_list = term.split(/[,\-\(\(]/).reject{ |e| e.empty? }
  term_split_list.each{ |e| e.gsub!(/[^\w\s]/, "") } #Remove punctuation
  term_split_list.each{ |e| e.strip! } #Remove any extra remaining whitespace
  term_split_list.reject! { |e| e.empty? }
  state_abbr_list = ['Mass']
  state_name_list = []
  country_name_list = []

  #Countries gem of https://github.com/hexorx/countries
  ISO3166::Country.new('US').states.each do |state_abbr, state_names|
    state_abbr_list << ' ' + state_abbr
    state_name_list << state_names["name"]
  end

  ISO3166::Country.all.each do |country_name_hash|
    #country_name_list << country_name_abbr_pair.first
    country_name_list << country_name_hash.data["name"] if country_name_hash.data["name"].present?
    country_name_hash.data["names"].each do |name|
      country_name_list << name
    end
  end
  country_name_list.append('South Korea') #Listed as Korea, Republic of in the gem
  country_name_list.append('North Korea') #Listed as Korea, Democratic People's Republic of in the gem

  #Parsing a subject geographic term.
  if (state_name_list & term_split_list).present? || (state_abbr_list & term_split_list).present? || (country_name_list & term_split_list).present?
    if term.include?('--')
      term.split('--').each_with_index do |split_term, index|
        if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
          #Cases like Naroden Etnografski Muzeĭ (Sofia, Bulgaria)--Catalogs
          if split_term.match(/\([^\)]+\)/)
            geo_term = split_term.gsub('(', ',').gsub(' ,', ', ')
            geo_term = geo_term.gsub(')', '')

=begin
        if split_term.match(/\([^\)]+,[^\)]+\)/)
          geo_term = split_term.match(/\([^\)]+\)/).to_s
          geo_term = geo_term[1..geo_term.length-2]
        #Abbeville (France)--History--20th century.
        elsif split_term.match(/\([^\)]+\)/)
          geo_term = split_term
=end
          else
            geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
          end

        elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
          geo_term = split_term
        end
      end
      #Other than a '--' field
      #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
    elsif term.include?(' - ')
      term.split(' - ').each do |split_term|
        if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
          geo_term = split_term
        end

      end
    else
      #if term_split_list.length > 1
      geo_term = term.gsub('(', ',').gsub(' ,', ', ')
      geo_term = geo_term.gsub(')', '')
      #end

    end
  end

  return geo_term
end
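
A usage sketch built from the example in the comments above (assumes the countries gem data is loaded; the result comes from the parenthesized-term branch):

Geomash::Standardizer.parse_for_geographic_term('Abbeville (France)--History--20th century.')
#=> "Abbeville, France"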

.parsed_and_original_check(geo_hash) ⇒ Object



# File 'lib/geomash/standardizer.rb', line 179

def self.parsed_and_original_check(geo_hash)
  term = geo_hash[:standardized_term]

  if geo_hash[:street_part].present? || geo_hash[:coords].present?
    return true
  end

  #Keep original string if three parts at least or if there is a number in the term.
  #TODO: Make this better!
  if (term.split(',').length >= 3 && geo_hash[:neighborhood_part].blank?) || (term.split(',').length >= 2 && geo_hash[:city_part].blank?) || term.split(',').length >= 4 || term.match(/\d/).present?
    return true
  end

  if geo_hash[:country_part] != 'United States'
    if geo_hash[:city_part].blank? && geo_hash[:state_part].blank?
      #Currently do nothing
    elsif !((geo_hash[:city_part].present? && term.to_ascii.downcase.include?(geo_hash[:city_part].to_ascii.downcase)) || (geo_hash[:state_part].present? && term.to_ascii.downcase.include?(geo_hash[:state_part].to_ascii.downcase)))
      return true
    end
  end


  return false
end
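
A minimal sketch of the early-return branch (the hash keys match the ones this method reads; the values are illustrative):

geo_hash = { standardized_term: '25 Main St, Palmer, MA', street_part: '25 Main St' }
Geomash::Standardizer.parsed_and_original_check(geo_hash)
#=> true   (a street part or coordinates always keep the original string)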

.standardize_geographic_term(geo_term) ⇒ Object

Make a string in a standard format.



# File 'lib/geomash/standardizer.rb', line 86

def self.standardize_geographic_term(geo_term)

  geo_term = geo_term.clone #Don't change original

  #Remove common junk terms
  Geomash::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }

  #Strip any leading periods or commas from junk terms
  geo_term = geo_term.gsub(/^[\.,]+/, '').strip

  #Collapse the four dashes left over when a junk term is removed from a TGN-style string
  geo_term = geo_term.gsub('----', '--')

  #Replace any semicolons with commas... possibly strip them instead?
  geo_term = geo_term.gsub(';', ',')

  #Terms in parentheses will cause some geographic parsers to freak out. Switch to commas instead.
  if geo_term.match(/[\(\)]+/)
    #Attempt to fix address if something like (word)
    if geo_term.match(/ \(+.*\)+/)
      #Make this replacement better?
      geo_term = geo_term.gsub(/ *\((?=[\S ]+\))/,', ')
      geo_term = geo_term.gsub(')', '')

      #Otherwise skip this, as the returned data would likely be unreliable for now... FIXME when a use case occurs.
    else
      return nil
    end
  end

  geo_term = geo_term.squeeze(',')

  return geo_term
end
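
An illustrative call, assuming none of Geomash::Constants::JUNK_TERMS match the input:

Geomash::Standardizer.standardize_geographic_term('Boston (Mass.)')
#=> "Boston, Mass."   (the parenthesized part is rewritten as a comma-separated part)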

.strip_value(value) ⇒ Object



# File 'lib/geomash/standardizer.rb', line 238

def self.strip_value(value)
  if(value.blank?)
    return nil
  else
    if value.class == Float || value.class == Fixnum
      value = value.to_i.to_s
    end

    # Make sure it is all UTF-8 and free of character entities or HTML tags, and remove any carriage returns
    return utf8Encode(value)
  end
end
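
A couple of illustrative calls (assumes ActionView and the htmlentities gem are available, since this delegates to utf8Encode):

Geomash::Standardizer.strip_value("Palmer, Mass.\r\n")
#=> "Palmer, Mass."
Geomash::Standardizer.strip_value(42.0)
#=> "42"   (numeric values are converted to integer strings)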

.try_with_entered_names(geo_hash) ⇒ Object



# File 'lib/geomash/standardizer.rb', line 257

def self.try_with_entered_names(geo_hash)
  geo_hash_local = geo_hash.clone
  geo_hash_local[:tgn] = nil
  if geo_hash_local[:neighborhood_part].present?
    orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:neighborhood_part].downcase.to_ascii}
    geo_hash_local[:neighborhood_part] = orig_string_check.first.strip if orig_string_check.present? && orig_string_check != geo_hash_local[:neighborhood_part]
    return geo_hash_local
  end

  if geo_hash_local[:city_part].present?
    orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:city_part].downcase.to_ascii}
    geo_hash_local[:city_part] = orig_string_check.first.strip if orig_string_check.present?
    return geo_hash_local
  end


  if geo_hash_local[:state_part].present?
    orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:state_part].downcase.to_ascii}
    geo_hash_local[:state_part] = orig_string_check.first.strip if orig_string_check.present?
    return geo_hash_local
  end

  return nil
end
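
A minimal sketch of the city branch (the hash shape is illustrative; String#to_ascii is assumed to be provided by the gem's dependencies):

geo_hash = { standardized_term: 'Quebec, Canada', city_part: 'Québec' }
Geomash::Standardizer.try_with_entered_names(geo_hash)
#=> { standardized_term: 'Quebec, Canada', city_part: 'Quebec', tgn: nil }

The parsed :city_part is swapped for its spelling in the originally entered string, and :tgn is always reset to nil.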

.utf8Encode(value) ⇒ Object

TODO: Better name for this. Should be part of an overall helper gem.



# File 'lib/geomash/standardizer.rb', line 252

def self.utf8Encode(value)
  cleaned = value.to_s.gsub(/\r?\n?\t/, ' ').gsub(/\r?\n/, ' ').gsub(/<br[\s]*\/>/, ' ')
  return HTMLEntities.new.decode(ActionView::Base.full_sanitizer.sanitize(cleaned)).strip
end
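
An illustrative call (assumes ActionView and the htmlentities gem are available):

Geomash::Standardizer.utf8Encode("Palmer &amp; vicinity<br/>\n")
#=> "Palmer & vicinity"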