Class: SportDb::Import::Variant

Inherits:
Object
  • Object
show all
Defined in:
lib/sportdb/config/variants.rb

Overview

(spelling) variant finder / builder for names

Constant Summary collapse

ALPHA_SPECIALS =

“simple” translation

## Transliteration table: accented / special Latin letter => plain ASCII
## replacement (one or two characters, e.g. 'æ'=>'ae', 'ß'=>'ss', 'þ'=>'th').
## Rows are grouped by base letter; the left column holds uppercase forms,
## the right column lowercase forms. Many letters only have a lowercase entry.
## Variant.find uses this table (via tr) to build the "simple" ASCII variant.
{
  'Ä'=>'A',  'ä'=>'a',
  'Á'=>'A',  'á'=>'a',
             'à'=>'a',
             'ã'=>'a',
             'â'=>'a',
  'Å'=>'A',  'å'=>'a',
             'æ'=>'ae',
             'ā'=>'a',
             'ă'=>'a',
             'ą'=>'a',

  'Ç' =>'C', 'ç'=>'c',
             'ć'=>'c',
  'Č'=>'C',  'č'=>'c',

  'É'=>'E',  'é'=>'e',
             'è'=>'e',
             'ê'=>'e',
             'ë'=>'e',
             'ė'=>'e',
             'ę'=>'e',

             'ğ'=>'g',

  ## Turkish dotted capital I and dotless small i both fold to the ASCII letter
  'İ'=>'I',
  'Í'=>'I',  'í'=>'i',
             'î'=>'i',
             'ī'=>'i',
             'ı'=>'i',

  'Ł'=>'L', 'ł'=>'l',

             'ñ'=>'n',
             'ń'=>'n',
             'ň'=>'n',

  'Ö'=>'O',  'ö'=>'o',
             'ó'=>'o',
             'õ'=>'o',
             'ô'=>'o',
             'ø'=>'o',
             'ő'=>'o',

              'ř'=>'r',

  ## NOTE(review): 'Ś' has no lowercase 'ś' counterpart in this table,
  ##   so a lowercase 'ś' passes through untranslated - confirm if intentional
  'Ś'=>'S',
  'Ş'=>'S',  'ş'=>'s',
  'Š'=>'S',  'š'=>'s',
             'ș'=>'s',  ## U+0219
             'ß'=>'ss',

             'ţ'=>'t',  ## U+0163
             'ț'=>'t',  ## U+021B
             'þ'=>'th',

  'Ü'=>'U',  'ü'=>'u',
  'Ú'=>'U',  'ú'=>'u',
             'ū'=>'u',

             'ý'=>'y',

             'ź'=>'z',
             'ż'=>'z',
  'Ž'=>'Z',  'ž'=>'z',
}
ALPHA_SPECIALS_DE =

German-style (de, at, ch) transliteration of umlauts and ß, e.g. ä => ae

## German-style transliteration: each umlaut expands to base vowel + 'e'
## (uppercase forms become capital vowel + lowercase 'e'), and 'ß' to 'ss'.
## The 'ß' entry is identical to the one in ALPHA_SPECIALS, so both tables can
## produce the same variant for a name; Variant.find de-duplicates its result.
{
  'Ä'=>'Ae',  'ä'=>'ae',
  'Ö'=>'Oe',  'ö'=>'oe',
  'Ü'=>'Ue',  'ü'=>'ue',
              'ß'=>'ss',
}
ALPHA_DOWNCASE =

## add ALPHA_SPECIALS_ES - why? why not? is Espanyol catalan spelling or spanish (castilian)? 'ñ'=>'ny', ## e.g. Español => Espanyol

## Uppercase => lowercase lookup table used by downcase_i18n (via tr).
##   Covers ASCII A-Z plus the accented capitals listed below;
##   any character that is not a key here is left untouched by tr.
('A'..'Z').each_with_object({}) { |ch, h| h[ch] = ch.downcase }.merge(
  'Ä'=>'ä',
  'Á'=>'á',
  'Å'=>'å',

  'Ç'=>'ç',
  'Č'=>'č',

  'É'=>'é',

  'İ'=>'i',   ## U+0130 (Turkish dotted capital I) - its lowercase is the plain ASCII i
  'Í'=>'í',

  'Ł'=>'ł',

  'Ö'=>'ö',

  'Ś'=>'ś',   ## U+015A => U+015B
  'Ş'=>'ş',
  'Š'=>'š',

  'Ü'=>'ü',
  'Ú'=>'ú',

  'Ž'=>'ž',
)

Class Method Summary collapse

Class Method Details

.alpha_specials_count(freq, mapping) ⇒ Object



132
133
134
135
136
137
# File 'lib/sportdb/config/variants.rb', line 132

## Count how many characters of a name are keys of the given mapping.
##   freq    - character frequency table (char => occurrences) as built by
##             frequency_table; looked up with freq[ch] for every mapping key
##   mapping - translation table (e.g. ALPHA_SPECIALS); only its keys are used
## Returns the summed occurrences (0 for an empty mapping).
def self.alpha_specials_count( freq, mapping )
  total = 0
  mapping.each_key { |special| total += freq[special] }
  total
end

.downcase_i18n(name) ⇒ Object

our very own downcase for int’l characters / letters



171
172
173
# File 'lib/sportdb/config/variants.rb', line 171

## Lowercase a name using the ALPHA_DOWNCASE lookup table instead of String#downcase.
##   Every character that is a key in ALPHA_DOWNCASE is replaced by its mapped value;
##   all other characters are copied through unchanged (see tr).
##   Returns a new string.
def self.downcase_i18n( name )    ## our very own downcase for int'l characters / letters
  tr( name, ALPHA_DOWNCASE )
end

.find(name) ⇒ Object



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/sportdb/config/variants.rb', line 153

## Build the list of alternative (ASCII) spellings for a name.
##   Each translation table is applied only when the name contains at least one
##   of its special characters, so a plain ASCII name yields an empty array.
##   Order: ALPHA_SPECIALS variant first, then the ALPHA_SPECIALS_DE variant.
##   Duplicates are removed, e.g. Preußen gives Preussen from both tables.
def self.find( name )
  freq = frequency_table( name )

  ## todo/fix: add / pass-in language/country code and check for ALPHA_SPECIALS_DE - why? why not?
  tables = [ALPHA_SPECIALS, ALPHA_SPECIALS_DE]

  ## keep only the tables that match at least one character (äöü etc.) of the name
  matching = tables.select { |table| alpha_specials_count( freq, table ) > 0 }

  matching.map { |table| tr( name, table ) }.uniq
end

.frequency_table(name) ⇒ Object

todo/check: use/rename to char_frequency_table



11
12
13
14
15
16
17
18
# File 'lib/sportdb/config/variants.rb', line 11

## todo/check: use/rename to char_frequency_table
## Calculate the frequency table of letters, digits, etc. in name.
##   Returns a hash (char => number of occurrences) whose default value is 0,
##   so looking up a character that does not occur gives 0 and not nil.
def self.frequency_table( name )
  name.each_char.each_with_object( Hash.new(0) ) do |char, counts|
    counts[char] += 1
  end
end

.tr(name, mapping) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
# File 'lib/sportdb/config/variants.rb', line 139

## Translate name character by character.
##   name    - string to translate
##   mapping - lookup table (char => replacement string); a replacement
##             may be longer than one character (e.g. 'ß'=>'ss')
##   Characters without a (truthy) entry in mapping are copied as is.
##   Returns a new string; name itself is not modified.
def self.tr( name, mapping )
  name.each_char.each_with_object( String.new ) do |char, out|
    out << (mapping[char] || char)
  end
end