Class: Bplgeo::Standardizer

Inherits: Object
Defined in:
lib/bplgeo/standardizer.rb

Class Method Summary

Class Method Details

.dedup_geo(geo_list, aggressive = false) ⇒ Object

Attempt to dedup a list of geographic areas. FIXME: Horrendous first pass. The aggressive flag removes less specific matches, i.e. ['Hanoi, Vietnam', 'Vietnam'] would return just ['Hanoi, Vietnam'].



# File 'lib/bplgeo/standardizer.rb', line 91

def self.dedup_geo(geo_list, aggressive=false)
  geo_list = geo_list.clone

  base_word_geo_list = []
  geo_list.each do |geo_term|
    geo_term = geo_term.gsub('(','').gsub(')','').gsub('.','').gsub(',','').gsub(';','')
    #Remove common junk terms
    Bplgeo::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }

    geo_term = geo_term.squish

    base_word_geo_list << geo_term
  end

  indexes_to_remove = []

  0.upto base_word_geo_list.size-1 do |index|
    matched_words_count = []
    current_best_term = geo_list[index]
    current_best_term_index = index

    base_word_geo_list[index].split(' ').each { |word|

      (index+1).upto base_word_geo_list.size-1 do |inner_index|
        if base_word_geo_list[inner_index].split(' ').any? { |single_word| single_word == word }
          matched_words_count[inner_index] ||= 0
          matched_words_count[inner_index] = matched_words_count[inner_index] + 1

        end
      end
    }

    matched_words_count.each_with_index do |matched_count, matched_index|
      matched_count ||= 0

      if (matched_count ==  base_word_geo_list[matched_index].split(' ').size) && ((base_word_geo_list[matched_index].split(' ').size < base_word_geo_list[index].split(' ').size && aggressive) || (base_word_geo_list[matched_index].split(' ').size == base_word_geo_list[index].split(' ').size))
        if current_best_term.split(',').size < geo_list[matched_index].split(',').size || (current_best_term.size+1 < geo_list[matched_index].size && !geo_list[matched_index].include?('('))
          current_best_term =  geo_list[matched_index]
          indexes_to_remove << current_best_term_index
          current_best_term_index = matched_index
        else
          indexes_to_remove << matched_index
        end
      end

    end
  end

  indexes_to_remove.each do |removal_index|
    geo_list[removal_index] = nil
  end

  return geo_list.compact
end
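
A rough usage sketch based on the example in the description above (the exact behavior also depends on Bplgeo::Constants::JUNK_TERMS, so treat the output as illustrative):

# Exact duplicates are always collapsed.
Bplgeo::Standardizer.dedup_geo(['Hanoi, Vietnam', 'Hanoi, Vietnam'])
#=> ["Hanoi, Vietnam"]

# With the aggressive flag, the less specific term is dropped as well.
Bplgeo::Standardizer.dedup_geo(['Hanoi, Vietnam', 'Vietnam'], true)
#=> ["Hanoi, Vietnam"]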

.LCSHize(value) ⇒ Object

Take LCSH subjects and make them standard.



# File 'lib/bplgeo/standardizer.rb', line 174

def self.LCSHize(value)
  #Remove ending periods ... except when an initial or etc.
  if value.last == '.' && value[-2].match(/[^A-Z]/) && !value[-4..-1].match('etc.')
    value = value.slice(0..-2)
  end

  #Fix when '- -' occurs
  value = value.gsub(/-\s-/,'--')

  #Fix for "em" dashes - two types?
  value = value.gsub('','--')

  #Fix for "em" dashes - two types?
  value = value.gsub('','--')

  #Fix for ' - ' combinations
  value = value.gsub(' - ','--')

  #Remove white space after and before  '--'
  value = value.gsub(/\s+--/,'--')
  value = value.gsub(/--\s+/,'--')

  #Ensure first word is capitalized
  value[0] = value.first.capitalize[0]

  #Strip any white space
  value = strip_value(value)

  return value
end
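
An illustrative sketch, traced from the code above rather than from a live run:

Bplgeo::Standardizer.LCSHize('Boston harbor - history.')
#=> "Boston harbor--history"   (trailing period stripped, ' - ' converted to '--', first letter capitalized)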

.parse_for_geographic_term(term) ⇒ Object

Take a subject string and look for potential geographic terms.



# File 'lib/bplgeo/standardizer.rb', line 5

def self.parse_for_geographic_term(term)
  geo_term = ''

  #Likely too long to be an address... some fields have junk with an address string...
  if term.length > 125
    return ''
  end

  state_abbr_list = ['Mass']
  state_name_list = []
  country_name_list = []

  #Countries gem of https://github.com/hexorx/countries
  Country.new('US').states.each do |state_abbr, state_names|
    state_abbr_list << ' ' + state_abbr
    state_name_list << state_names["name"]
  end

  Country.all.each do |country_name_abbr_pair|
    country_name_list << country_name_abbr_pair.first
  end

  #Parsing a subject geographic term.
  if term.include?('--')
    term.split('--').each_with_index do |split_term, index|
      if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
        geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
      elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
        geo_term = split_term
      end
    end
    #Other than a '--' field
    #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
  elsif term.include?(' - ')
    term.split(' - ').each do |split_term|
      if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
        geo_term = split_term
      end

    end
  else
    if state_name_list.any? { |state| term.include? state } || state_abbr_list.any? { |abbr| term.include? abbr } || country_name_list.any? { |country| term.include? country }
      geo_term = term
    end
  end

  return geo_term
end
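
Two hedged examples, one per parsing branch; the second input comes from the comment in the code. Results depend on the state and country data supplied by the countries gem:

Bplgeo::Standardizer.parse_for_geographic_term('Bridges--Massachusetts--Boston')
#=> "Boston,Massachusetts"   (the state part and everything after it, reversed and joined with commas)

Bplgeo::Standardizer.parse_for_geographic_term('Stores (retail trade) - Palmer, Mass')
#=> "Palmer, Mass"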

.parsed_and_original_check(geo_hash) ⇒ Object



# File 'lib/bplgeo/standardizer.rb', line 146

def self.parsed_and_original_check(geo_hash)
  term = geo_hash[:standardized_term]

  if geo_hash[:street_part].present? || geo_hash[:coords].present?
    return true
  end

  #Keep original string if three parts at least or if there is a number in the term.
  #TODO: Make this better!
  if (term.split(',').length >= 3 && geo_hash[:neighborhood_part].blank?) || (term.split(',').length >= 2 && geo_hash[:city_part].blank?) || term.split(',').length >= 4 || term.match(/\d/).present?
    return true
  end

  if geo_hash[:country_part] != 'United States'
    if geo_hash[:city_part].blank? && geo_hash[:state_part].blank?
      #Currently do nothing
    elsif !((geo_hash[:city_part].present? && term.to_ascii.downcase.include?(geo_hash[:city_part].to_ascii.downcase)) || (geo_hash[:state_part].present? && term.to_ascii.downcase.include?(geo_hash[:state_part].to_ascii.downcase)))
      return true
    end
  end


  return false
end
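
A sketch of the first rule with a hypothetical geo_hash (keys follow the structure referenced in the code above):

geo_hash = { standardized_term: '123 Main Street, Palmer, Mass',
             street_part: '123 Main Street', city_part: 'Palmer',
             state_part: 'Massachusetts', country_part: 'United States' }
Bplgeo::Standardizer.parsed_and_original_check(geo_hash)
#=> true   (a street part or coordinates always keep the original string)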

.standardize_geographic_term(geo_term) ⇒ Object

Make a string in a standard format.



# File 'lib/bplgeo/standardizer.rb', line 55

def self.standardize_geographic_term(geo_term)

  geo_term = geo_term.clone #Don't change original

  #Remove common junk terms
  Bplgeo::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }

  #Strip any leading periods or commas from junk terms
  geo_term = geo_term.gsub(/^[\.,]+/, '').strip

  #Replace any four TGN dashes from removing a junk term
  geo_term = geo_term.gsub('----', '--')

  #Replace any semicolons with commas... possibly strip them?
  geo_term = geo_term.gsub(';', ',')

  #Terms in parentheses will cause some geographic parsers to freak out. Switch to commas instead.
  if geo_term.match(/[\(\)]+/)
    #Attempt to fix address if something like (word)
    if geo_term.match(/ \(+.*\)+/)
      #Make this replacement better?
      geo_term = geo_term.gsub(/ *\((?=[\S ]+\))/,', ')
      geo_term = geo_term.gsub(')', '')

      #Else skip this as data returned likely will be unreliable for now... FIXME when use case occurs.
    else
      return nil
    end
  end

  return geo_term
end
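
An illustrative sketch; the results assume no junk terms from Bplgeo::Constants::JUNK_TERMS appear in the input:

Bplgeo::Standardizer.standardize_geographic_term('Boston (Mass.)')
#=> "Boston, Mass."   (the parenthesized part becomes a comma-separated part)

Bplgeo::Standardizer.standardize_geographic_term('(Mass.) something unparseable')
#=> nil   (parentheses not preceded by other text are skipped as unreliable)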

.strip_value(value) ⇒ Object



# File 'lib/bplgeo/standardizer.rb', line 205

def self.strip_value(value)
  if(value.blank?)
    return nil
  else
    if value.class == Float || value.class == Fixnum
      value = value.to_i.to_s
    end

    # Make sure it is all UTF-8, strip any HTML tags or character entities, and remove any carriage returns
    return utf8Encode(value)
  end
end
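
A few hedged examples of the expected behavior:

Bplgeo::Standardizer.strip_value('')                       #=> nil
Bplgeo::Standardizer.strip_value(42.0)                     #=> "42"
Bplgeo::Standardizer.strip_value(' <b>Boston</b>, Mass.')  #=> "Boston, Mass."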

.try_with_entered_names(geo_hash) ⇒ Object



# File 'lib/bplgeo/standardizer.rb', line 224

def self.try_with_entered_names(geo_hash)
  geo_hash_local = geo_hash.clone
  if geo_hash_local[:neighborhood_part].present?
    orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:neighborhood_part].downcase.to_ascii}
    geo_hash_local[:neighborhood_part] = orig_string_check.first.strip if orig_string_check.present? && orig_string_check.first != geo_hash_local[:neighborhood_part]
    return geo_hash_local
  end

  if geo_hash_local[:city_part].present?
    orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:city_part].downcase.to_ascii}
    geo_hash_local[:city_part] = orig_string_check.first.strip if orig_string_check.present?
    return geo_hash_local
  end


  if geo_hash_local[:state_part].present?
    orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:state_part].downcase.to_ascii}
    geo_hash_local[:state_part] = orig_string_check.first.strip if orig_string_check.present?
    return geo_hash_local
  end

  return nil
end
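
A hypothetical sketch of the city branch: when the ASCII-folded city name matches a word of the original string, the parser's accented spelling is swapped back to the spelling the user entered:

geo_hash = { standardized_term: 'Quebec, Canada', city_part: 'Québec', country_part: 'Canada' }
Bplgeo::Standardizer.try_with_entered_names(geo_hash)
#=> { standardized_term: 'Quebec, Canada', city_part: 'Quebec', country_part: 'Canada' }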

.utf8Encode(value) ⇒ Object

TODO: Better name for this. Should be part of an overall helper gem.



# File 'lib/bplgeo/standardizer.rb', line 219

def self.utf8Encode(value)
  return HTMLEntities.new.decode(ActionView::Base.full_sanitizer.sanitize(value.to_s.gsub(/\r?\n?\t/, ' ').gsub(/\r?\n/, ' ').gsub(/<br[\s]*\/>/,' '))).strip
end
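
A brief hedged example; the exact entity handling depends on the installed sanitizer and HTMLEntities versions:

Bplgeo::Standardizer.utf8Encode("Boston &amp; vicinity\r\n")
#=> "Boston & vicinity"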