Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/name_tamer/string.rb

Constant Summary collapse

# U+00A0 NO-BREAK SPACE. Swapped in for ASCII spaces inside names whose
# parts have to stay together.
NONBREAKING_SPACE = "\u00a0".freeze
# The ordinary ASCII space character (U+0020).
ASCII_SPACE = ' '.freeze
# Known multi-word names that contain a space (not a hyphen). The space inside
# each of these is what gets replaced by a non-breaking space.
COMPOUND_NAMES = [
  'Lane Fox',
  'Bonham Carter',
  'Pitt Rivers',
  'Lloyd Webber',
  'Sebag Montefiore',
  'Holmes à Court',
  'Holmes a Court',
  'Baron Cohen',
  'Strang Steel',
  'Service Company',
  'Corporation Company',
  'Corporation System',
  'Incorporations Limited'
].freeze
# Name modifiers (particles such as "van" or "de"). Each entry is a
# regular-expression fragment, not a plain string: it is interpolated into
# larger patterns, so character classes like 'D[aeiou]' are intentional.
NAME_MODIFIERS = [
  'Al',
  'Ap',
  'Ben',
  'Dell[ae]',
  'D[aeiou]',
  'De[lrn]',
  'D[ao]s',
  'El',
  'La',
  'L[eo]',
  'V[ao]n',
  'Of',
  'San',
  'St[\.]?',
  'Zur'
].freeze
# Transliterations (like the i18n defaults); see
# github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb
#
# Maps ONE non-ASCII character to its ASCII approximation. Every key must be a
# single character, because the lookup is done one matched character at a
# time; a multi-character key can never be found. The keys for the IJ ligatures
# (U+0132, U+0133) and for the apostrophe-n (U+0149) are therefore the
# precomposed single characters, not their decomposed spellings.
APPROXIMATIONS = {
  'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE',
  'Ç' => 'C', 'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I',
  'Î' => 'I', 'Ï' => 'I', 'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O',
  'Õ' => 'O', 'Ö' => 'O', '×' => 'x', 'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U',
  'Ü' => 'U', 'Ý' => 'Y', 'Þ' => 'Th', 'ß' => 'ss', 'à' => 'a', 'á' => 'a', 'â' => 'a',
  'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e',
  'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'd',
  'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
  'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y',
  'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A', 'ą' => 'a', 'Ć' => 'C',
  'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c', 'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c',
  'Ď' => 'D', 'ď' => 'd', 'Đ' => 'D', 'đ' => 'd', 'Ē' => 'E', 'ē' => 'e', 'Ĕ' => 'E',
  'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e', 'Ę' => 'E', 'ę' => 'e', 'Ě' => 'E', 'ě' => 'e',
  'Ĝ' => 'G', 'ĝ' => 'g', 'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G',
  'ģ' => 'g', 'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h', 'Ĩ' => 'I', 'ĩ' => 'i',
  'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I', 'ĭ' => 'i', 'Į' => 'I', 'į' => 'i', 'İ' => 'I',
  'ı' => 'i', 'Ĳ' => 'IJ', 'ĳ' => 'ij', 'Ĵ' => 'J', 'ĵ' => 'j', 'Ķ' => 'K', 'ķ' => 'k',
  'ĸ' => 'k', 'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l', 'Ľ' => 'L', 'ľ' => 'l',
  'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L', 'ł' => 'l', 'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N',
  'ņ' => 'n', 'Ň' => 'N', 'ň' => 'n', 'ŉ' => "'n", 'Ŋ' => 'NG', 'ŋ' => 'ng',
  'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O', 'ő' => 'o', 'Œ' => 'OE',
  'œ' => 'oe', 'Ŕ' => 'R', 'ŕ' => 'r', 'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r',
  'Ś' => 'S', 'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's', 'Š' => 'S',
  'š' => 's', 'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T', 'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't',
  'Ũ' => 'U', 'ũ' => 'u', 'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u', 'Ů' => 'U',
  'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u', 'Ŵ' => 'W', 'ŵ' => 'w',
  'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y', 'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z',
  'Ž' => 'Z', 'ž' => 'z'
}.freeze
BAD_ENCODING =

When strings are mistakenly encoded as single-byte character sets instead of UTF-8, there are some distinctive character combinations that we can spot and fix. There is a useful table at www.i18nqa.com/debug/utf8-debug.html

{
  '€' => '', '‚' => '', 'Æ’' => 'ƒ', '„' => '', '…' => '',
  '†' => '', '‡' => '', 'ˆ' => 'ˆ', '‰' => '', 'Å ' => 'Š',
  '‹' => '', 'Å’' => 'Œ', 'Ž' => 'Ž', '‘' => '', '’' => '',
  '“' => '',
  '”' => '', # Note the invisible Ux009D in the key
  '′' => '', # Manually added. Some seem to use this instead of Ux2019
  '•' => '', '–' => '', '—' => '',
  'Ëœ' => '˜', 'â„¢' => '', 'Å¡' => 'š', '›' => '', 'Å“' => 'œ',
  'ž' => 'ž', 'Ÿ' => 'Ÿ', ' ' => ' ', '¡' => '¡', '¢' => '¢',
  '£' => '£', '¤' => '¤', 'Â¥' => '¥', '¦' => '¦', '§' => '§',
  '¨' => '¨', '©' => '©', 'ª' => 'ª', '«' => '«', '¬' => '¬',
  '­' => '­', '®' => '®', '¯' => '¯', '°' => '°', '±' => '±',
  '²' => '²', '³' => '³', '´' => '´', 'µ' => 'µ', '¶' => '',
  '·' => '·', '¸' => '¸', '¹' => '¹', 'º' => 'º', '»' => '»',
  '¼' => '¼', '½' => '½', '¾' => '¾', '¿' => '¿', 'À' => 'À',
  'Ã�' => 'Á', 'Â' => 'Â', 'Ã' => 'Ã', 'Ä' => 'Ä', 'Ã…' => 'Å',
  'Æ' => 'Æ', 'Ç' => 'Ç', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
  'Ë' => 'Ë', 'ÃŒ' => 'Ì', "\xC3\x8D" => 'Í', 'ÃŽ' => 'Î', "\xC3\x8F" => 'Ï',
  "\xC3\x90" => 'Ð', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó', 'Ô' => 'Ô',
  'Õ' => 'Õ', 'Ö' => 'Ö', '×' => '×', 'Ø' => 'Ø', 'Ù' => 'Ù',
  'Ú' => 'Ú', 'Û' => 'Û', 'Ü' => 'Ü', "\xC3\x9D" => 'Ý', 'Þ' => 'Þ',
  'ß' => 'ß', 'à' => 'à', 'á' => 'á', 'â' => 'â', 'ã' => 'ã',
  'ä' => 'ä', 'Ã¥' => 'å', 'æ' => 'æ', 'ç' => 'ç', 'è' => 'è',
  'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'í' => 'í',
  'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò',
  'ó' => 'ó', 'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', '÷' => '÷',
  'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú', 'û' => 'û', 'ü' => 'ü',
  'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ',
  "\x00" => '' # Manually added to avoid Bad Argument exception
}.freeze
# One regular expression that matches any key of BAD_ENCODING; the matched
# text is available both as the whole match and as capture group 1.
#
# Each key is passed through Regexp.escape so that it is always matched
# literally, and the keys are tried longest first so that a short key can
# never shadow a longer key that begins with it.
BAD_ENCODING_PATTERNS =
  /(#{BAD_ENCODING.keys.sort_by { |key| -key.length }.map { |key| Regexp.escape(key) }.join('|')})/

Instance Method Summary collapse

Instance Method Details

#ansi_attributes(*args) ⇒ Object



276
277
278
# File 'lib/name_tamer/string.rb', line 276

# Wraps the receiver in an ANSI escape sequence.
#
# @param args [Array] attribute codes; they are joined with ';' between
#   "\e[" and "m"
# @return [String] a new string: the opening sequence, the receiver, then the
#   reset sequence "\e[0m"
def ansi_attributes(*args)
  codes = args.join(';')
  "\e[#{codes}m#{self}\e[0m"
end

#approximate_latin_chars! ⇒ Object

Any characters that resemble Latin characters might usefully be transliterated into ones that are easy to type on an anglophone keyboard.



68
69
70
# File 'lib/name_tamer/string.rb', line 68

# Replaces, in place, every non-ASCII character that has an entry in
# APPROXIMATIONS with that entry; characters without an entry are kept.
#
# @return [String] the receiver
def approximate_latin_chars!
  changed = gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS.fetch(char, char) }
  changed || self # gsub! gives nil when there was nothing to replace
end

#downcase_after_apostrophe! ⇒ Object



82
83
84
# File 'lib/name_tamer/string.rb', line 82

# Lowercases, in place, a single word character that follows an apostrophe
# and ends the word (the 's in "Smith'S").
#
# @return [String] the receiver
def downcase_after_apostrophe!
  gsub!(/\'\w\b/) { |suffix| suffix.downcase } || self
end

#ensure_safe! ⇒ Object



171
172
173
# File 'lib/name_tamer/string.rb', line 171

# Re-encodes the receiver in place as UTF-8, replacing invalid or undefined
# sequences with the empty string.
#
# @return [String] the receiver
def ensure_safe!
  replacement_rules = { invalid: :replace, undef: :replace, replace: '' }
  encode!('UTF-8', **replacement_rules)
end

#ensure_space_after_initials! ⇒ Object



167
168
169
# File 'lib/name_tamer/string.rb', line 167

# Inserts, in place, a space after an initial ("J.") when it is immediately
# followed by at least two letters or digits, e.g. "J.Smith" -> "J. Smith".
#
# @return [String] the receiver
def ensure_space_after_initials!
  gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i, '\1 ') || self
end

#fix_apostrophe_modifiers! ⇒ Object



125
126
127
128
129
130
131
# File 'lib/name_tamer/string.rb', line 125

# Lowercases, in place, the modifiers "Dell'" and "D'" when they are preceded
# by some character and followed by a word character, e.g.
# "Commedia Dell'Arte" -> "Commedia dell'Arte".
#
# The pattern needs one character in front of the modifier, so a modifier at
# the very start of the string is left alone. That leading character is part
# of the first capture and is lowercased along with the modifier.
#
# @return [String] the receiver
def fix_apostrophe_modifiers!
  %w[Dell D].each do |modifier|
    pattern = /(.#{modifier}')(\w)/
    gsub!(pattern) do
      prefix, following = Regexp.last_match.captures
      prefix.rstrip.downcase + following
    end
  end

  self # Allows chaining
end

#fix_encoding_errors! ⇒ Object

Strings that were wrongly encoded with single-byte encodings sometimes have tell-tale substrings that we can turn back into the correct UTF-8 characters.



74
75
76
# File 'lib/name_tamer/string.rb', line 74

# Replaces, in place, every substring matched by BAD_ENCODING_PATTERNS with
# its BAD_ENCODING entry; a match without an entry is kept as it is.
#
# @return [String] the receiver
def fix_encoding_errors!
  repaired = gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING.fetch(substring, substring) }
  repaired || self # gsub! gives nil when nothing matched
end

#fix_ff! ⇒ Object

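Fix the handful of surnames that are written with a lowercase double f (e.g. ffrench, fforde) by downcasing their capitalized form.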



103
104
105
106
107
108
109
# File 'lib/name_tamer/string.rb', line 103

# Downcases, in place, each occurrence of the listed "Ff" surnames,
# e.g. "Ffrench" -> "ffrench".
#
# @return [String] the receiver
def fix_ff!
  ff_names = %w[Fforbes Fforde Ffinch Ffrench Ffoulkes]
  gsub!(Regexp.union(ff_names), &:downcase)

  self # Allows chaining
end

#fix_mac! ⇒ Object

Our list of terminal characters that indicate a non-Celtic name used to include "o", but we removed it because of MacMurdo.



88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/name_tamer/string.rb', line 88

# Capitalizes, in place, the part of a name that follows "Mac"/"Mc"
# (e.g. "Macdonald" -> "MacDonald"), then undoes that for a fixed list of
# exceptions (e.g. "MacHin" -> "Machin").
#
# Nothing is changed unless the string contains a word that starts with "Mac",
# has at least three more letters and does not end in a, c, i, z or j, or
# contains a word that starts with "Mc".
#
# @return [String] the receiver
def fix_mac!
  needs_fixing = self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
  return self unless needs_fixing

  gsub!(/\b(Ma?c)([A-Za-z]+)/) do
    prefix, remainder = Regexp.last_match.captures
    prefix + remainder.capitalize
  end

  # Fix Mac exceptions: each entry is spelled the way the step above produces
  # it and is put back to ordinary capitalization.
  exceptions = %w[
    MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
    MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
  ]
  exceptions.each do |mac_name|
    substitute!(/\b#{mac_name}/, mac_name.capitalize)
  end

  self # Allows chaining
end

#fix_name_modifiers! ⇒ Object

Fixes for name modifiers followed by a space; also replaces those spaces with non-breaking spaces. Then fixes name modifiers followed by an apostrophe, e.g. d’Artagnan, Commedia dell’Arte.



114
115
116
117
118
119
120
121
122
123
# File 'lib/name_tamer/string.rb', line 114

# Lowercases, in place, every name modifier from NAME_MODIFIERS that stands at
# the start of a line or after whitespace and is followed by whitespace or a
# hyphen. ASCII spaces after the modifier become non-breaking spaces; a hyphen
# is kept. Modifiers followed by an apostrophe are then handled by
# fix_apostrophe_modifiers!.
#
# @return [String] the receiver
def fix_name_modifiers!
  NAME_MODIFIERS.each do |modifier|
    pattern = /((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/
    gsub!(pattern) do
      leading, gap = Regexp.last_match.captures
      leading.rstrip.downcase + gap.tr(ASCII_SPACE, NONBREAKING_SPACE)
    end
  end

  fix_apostrophe_modifiers!
  self # Allows chaining
end

#fix_separators!(separator) ⇒ Object

Make sure separators are not where they shouldn’t be



53
54
55
56
57
58
59
60
61
62
63
# File 'lib/name_tamer/string.rb', line 53

# Collapses, in place, runs of the separator into a single separator and
# removes a separator at the start or end of a line.
#
# @param separator [String, nil] the separator; nil or '' leaves the string
#   untouched
# @return [String] the receiver
def fix_separators!(separator)
  return self if separator.nil? || separator.empty?

  r = Regexp.escape(separator)

  # No more than one of the separator in a row. The separator is wrapped in a
  # group so that the quantifier applies to the whole separator; without the
  # group it would only repeat the last character of a multi-character one.
  substitute!(/(?:#{r}){2,}/, separator)

  # Remove leading/trailing separator.
  substitute!(/^#{r}|#{r}$/i, '')
end

#invalid_chars_to!(separator) ⇒ Object

Change some characters embedded in words (periods and slashes) to our separator character, e.g. example.com -> example-com



30
31
32
# File 'lib/name_tamer/string.rb', line 30

# Replaces, in place, every period or slash that has no whitespace directly
# before or after it with the separator, e.g. "example.com" -> "example-com".
#
# @param separator [String] the replacement text
# @return [String] the receiver
def invalid_chars_to!(separator)
  embedded_punctuation = %r{(?<![[:space:]])[\.\/](?![[:space:]])}
  substitute!(embedded_punctuation, separator)
end

#nbsp_in_compound_name! ⇒ Object

Fix known last names that have spaces (not hyphens!)



141
142
143
144
145
146
147
# File 'lib/name_tamer/string.rb', line 141

# Replaces, in place, the ASCII spaces inside every name listed in
# COMPOUND_NAMES with non-breaking spaces.
#
# @return [String] the receiver
def nbsp_in_compound_name!
  COMPOUND_NAMES.each do |compound_name|
    unbreakable = compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE)
    substitute!(compound_name, unbreakable)
  end

  self # Allows chaining
end

#nbsp_in_name_modifier! ⇒ Object



149
150
151
152
153
154
155
# File 'lib/name_tamer/string.rb', line 149

# Replaces, in place, the single whitespace character that follows a name
# modifier (matched case-insensitively, after a whitespace character) with a
# non-breaking space. The modifier itself is left as written.
#
# @return [String] the receiver
def nbsp_in_name_modifier!
  NAME_MODIFIERS.each do |modifier|
    gsub!(/([[:space:]]#{modifier})([[:space:]])/i) { Regexp.last_match(1) + NONBREAKING_SPACE }
  end

  self # Allows chaining
end

#presence ⇒ Object



4
5
6
# File 'lib/name_tamer/string.rb', line 4

# Returns the receiver, or nil when it is the empty string.
#
# @return [String, nil]
def presence
  empty? ? nil : self
end

#remove_periods_from_initials! ⇒ Object



157
158
159
# File 'lib/name_tamer/string.rb', line 157

# Removes, in place, the period after a single-letter initial,
# e.g. "J.R.R. Tolkien" -> "JRR Tolkien".
#
# @return [String] the receiver
def remove_periods_from_initials!
  gsub!(/\b([a-z])\./i, '\1') || self
end

#remove_spaces_from_initials! ⇒ Object



161
162
163
164
165
# File 'lib/name_tamer/string.rb', line 161

# Removes, in place, the space after a single-letter initial (with or without
# a period) unless the next word is at least two characters long,
# e.g. "J R R Tolkien" -> "JRR Tolkien".
#
# @return [String] the receiver
def remove_spaces_from_initials!
  initial_then_space = /\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i
  # captures are [letter, period-or-nil]; join drops the nil
  gsub!(initial_then_space) { Regexp.last_match.captures.join } || self
end

#safe_unescape! ⇒ Object

Unescape percent-encoded characters. This might introduce invalid UTF-8 byte sequences, so we take precautions.



37
38
39
40
41
42
43
44
45
# File 'lib/name_tamer/string.rb', line 37

# Unescapes percent-encoded characters in place. Unescaping can produce byte
# sequences that are not valid UTF-8, so the result is passed through
# ensure_safe! afterwards.
#
# URI.unescape was removed in Ruby 3.0; it delegated to
# URI::DEFAULT_PARSER.unescape, which is called directly here.
#
# @return [String] the receiver; it is left untouched when there is nothing
#   to unescape or when unescaping raises Encoding::CompatibilityError
def safe_unescape!
  string = URI::DEFAULT_PARSER.unescape(self)
rescue Encoding::CompatibilityError # e.g. "\u2019%80"
  self
else
  return self if self == string

  replace string
  ensure_safe!
end

#space_around_comma! ⇒ Object

Ensure commas have no whitespace before them and exactly one space after them.



24
25
26
# File 'lib/name_tamer/string.rb', line 24

# Normalizes, in place, every comma and the whitespace around it to a comma
# followed by one space.
#
# @return [String] the receiver
def space_around_comma!
  comma_with_padding = /[[:space:]]*,[[:space:]]*/
  substitute!(comma_with_padding, ', ')
end

#strip_or_self! ⇒ Object



14
15
16
# File 'lib/name_tamer/string.rb', line 14

# Strips leading and trailing whitespace in place and, unlike String#strip!,
# returns the receiver even when there was nothing to strip.
#
# @return [String] the receiver
def strip_or_self!
  strip!
  self
end

#strip_unwanted!(filter) ⇒ Object

Strip illegal characters out completely



10
11
12
# File 'lib/name_tamer/string.rb', line 10

# Deletes, in place, everything that matches the filter.
#
# @param filter [Regexp, String] pattern describing what to remove
# @return [String] the receiver
def strip_unwanted!(filter)
  gsub!(filter, '') || self
end

#substitute!(pattern, replacement) ⇒ Object



175
176
177
# File 'lib/name_tamer/string.rb', line 175

# Like String#gsub!, but always returns the receiver (gsub! returns nil when
# no substitution was made), so calls can be chained.
#
# @param pattern [Regexp, String] what to look for
# @param replacement [String] what to put in its place
# @return [String] the receiver
def substitute!(pattern, replacement)
  gsub!(pattern, replacement)
  self
end

#unescape_html!Object

Replace HTML entities with the characters they represent



48
49
50
# File 'lib/name_tamer/string.rb', line 48

# Replaces the receiver's contents with the result of CGI.unescapeHTML, i.e.
# HTML entities such as "&amp;" become the characters they stand for.
#
# @return [String] the receiver
def unescape_html!
  replace(CGI.unescapeHTML(self))
end

#upcase_first_letter! ⇒ Object



78
79
80
# File 'lib/name_tamer/string.rb', line 78

# Upcases, in place, the first word character of every word,
# e.g. "john o'neil" -> "John O'Neil".
#
# @return [String] the receiver
def upcase_first_letter!
  gsub!(/\b\w/) { |letter| letter.upcase } || self
end

#upcase_initials! ⇒ Object

Upcase words with no vowels, e.g. JPR Williams, except for Ng.



135
136
137
138
# File 'lib/name_tamer/string.rb', line 135

# Upcases, in place, every word that consists only of consonants (y is not in
# the list), e.g. "jpr williams" -> "JPR williams", and then writes the word
# "NG" as "Ng".
#
# @return [String] the receiver
def upcase_initials!
  gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { Regexp.last_match(1).upcase }
  gsub!(/\b(NG)\b/i) { Regexp.last_match(1).capitalize } # http://en.wikipedia.org/wiki/Ng
  self
end

#whitespace_to!(separator) ⇒ Object

Change any whitespace into our separator character



19
20
21
# File 'lib/name_tamer/string.rb', line 19

# Replaces, in place, every run of whitespace with one separator.
#
# @param separator [String] the replacement text
# @return [String] the receiver
def whitespace_to!(separator)
  whitespace_run = /[[:space:]]+/
  substitute!(whitespace_run, separator)
end