Class: String

Inherits:

Object

Object
String

show all

Defined in:: lib/name_tamer/string.rb

Constant Summary collapse

# U+00A0 NO-BREAK SPACE; the replacement character used wherever this file
# swaps an ASCII_SPACE for a non-breaking one.
NONBREAKING_SPACE =

"\u00a0".freeze

# A plain ASCII space (U+0020); the character that gets translated to
# NONBREAKING_SPACE by the tr calls in this file.
ASCII_SPACE =

' '.freeze

# Multi-word phrases whose internal spaces nbsp_in_compound_name! replaces with
# NONBREAKING_SPACE. Entries are matched as literal strings (case-sensitive).
COMPOUND_NAMES =

[
  'Lane Fox', 'Bonham Carter', 'Pitt Rivers', 'Lloyd Webber', 'Sebag Montefiore', 'Holmes à Court', 'Holmes a Court',
  'Baron Cohen', 'Strang Steel',
  'Service Company', 'Corporation Company', 'Corporation System', 'Incorporations Limited'
].freeze

# Regular-expression fragments (not literal strings) for name particles.
# Each entry is interpolated into a pattern by fix_name_modifiers! and
# nbsp_in_name_modifier!, so character classes such as 'D[aeiou]' are intentional.
NAME_MODIFIERS =

[
  'Al', 'Ap', 'Ben', 'Dell[ae]', 'D[aeiou]', 'De[lrn]', 'D[ao]s', 'El', 'La', 'L[eo]', 'V[ao]n', 'Of', 'San',
  'St[\.]?', 'Zur'
].freeze

APPROXIMATIONS = Transliterations (like the i18n defaults); see github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb

{
  'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE',
  'Ç' => 'C', 'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I',
  'Î' => 'I', 'Ï' => 'I', 'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O',
  'Õ' => 'O', 'Ö' => 'O', '×' => 'x', 'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U',
  'Ü' => 'U', 'Ý' => 'Y', 'Þ' => 'Th', 'ß' => 'ss', 'à' => 'a', 'á' => 'a', 'â' => 'a',
  'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e',
  'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'd',
  'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
  'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y',
  'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A', 'ą' => 'a', 'Ć' => 'C',
  'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c', 'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c',
  'Ď' => 'D', 'ď' => 'd', 'Đ' => 'D', 'đ' => 'd', 'Ē' => 'E', 'ē' => 'e', 'Ĕ' => 'E',
  'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e', 'Ę' => 'E', 'ę' => 'e', 'Ě' => 'E', 'ě' => 'e',
  'Ĝ' => 'G', 'ĝ' => 'g', 'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G',
  'ģ' => 'g', 'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h', 'Ĩ' => 'I', 'ĩ' => 'i',
  'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I', 'ĭ' => 'i', 'Į' => 'I', 'į' => 'i', 'İ' => 'I',
  'ı' => 'i', 'Ĳ' => 'IJ', 'ĳ' => 'ij', 'Ĵ' => 'J', 'ĵ' => 'j', 'Ķ' => 'K', 'ķ' => 'k',
  'ĸ' => 'k', 'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l', 'Ľ' => 'L', 'ľ' => 'l',
  'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L', 'ł' => 'l', 'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N',
  'ņ' => 'n', 'Ň' => 'N', 'ň' => 'n', 'ŉ' => "'n", 'Ŋ' => 'NG', 'ŋ' => 'ng',
  'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O', 'ő' => 'o', 'Œ' => 'OE',
  'œ' => 'oe', 'Ŕ' => 'R', 'ŕ' => 'r', 'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r',
  'Ś' => 'S', 'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's', 'Š' => 'S',
  'š' => 's', 'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T', 'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't',
  'Ũ' => 'U', 'ũ' => 'u', 'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u', 'Ů' => 'U',
  'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u', 'Ŵ' => 'W', 'ŵ' => 'w',
  'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y', 'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z',
  'Ž' => 'Z', 'ž' => 'z'
}.freeze

BAD_ENCODING = When strings are mistakenly encoded as single-byte character sets instead of UTF-8, there are some distinctive character combinations that we can spot and fix. Useful table here: www.i18nqa.com/debug/utf8-debug.html

{
  'â‚¬' => '€', 'â€š' => '‚', 'Æ’' => 'ƒ', 'â€ž' => '„', 'â€¦' => '…',
  'â€ ' => '†', 'â€¡' => '‡', 'Ë†' => 'ˆ', 'â€°' => '‰', 'Å ' => 'Š',
  'â€¹' => '‹', 'Å’' => 'Œ', 'Å½' => 'Ž', 'â€˜' => '‘', 'â€™' => '’',
  'â€œ' => '“',
  'â€' => '”', # Note the invisible Ux009D in the key
  'â€²' => '′', # Manually added. Some seem to use this instead of Ux2019
  'â€¢' => '•', 'â€“' => '–', 'â€”' => '—',
  'Ëœ' => '˜', 'â„¢' => '™', 'Å¡' => 'š', 'â€º' => '›', 'Å“' => 'œ',
  'Å¾' => 'ž', 'Å¸' => 'Ÿ', 'Â ' => ' ', 'Â¡' => '¡', 'Â¢' => '¢',
  'Â£' => '£', 'Â¤' => '¤', 'Â¥' => '¥', 'Â¦' => '¦', 'Â§' => '§',
  'Â¨' => '¨', 'Â©' => '©', 'Âª' => 'ª', 'Â«' => '«', 'Â¬' => '¬',
  'Â' => '', 'Â®' => '®', 'Â¯' => '¯', 'Â°' => '°', 'Â±' => '±',
  'Â²' => '²', 'Â³' => '³', 'Â´' => '´', 'Âµ' => 'µ', 'Â¶' => '¶',
  'Â·' => '·', 'Â¸' => '¸', 'Â¹' => '¹', 'Âº' => 'º', 'Â»' => '»',
  'Â¼' => '¼', 'Â½' => '½', 'Â¾' => '¾', 'Â¿' => '¿', 'Ã€' => 'À',
  'Ã�' => 'Á', 'Ã‚' => 'Â', 'Ãƒ' => 'Ã', 'Ã„' => 'Ä', 'Ã…' => 'Å',
  'Ã†' => 'Æ', 'Ã‡' => 'Ç', 'Ãˆ' => 'È', 'Ã‰' => 'É', 'ÃŠ' => 'Ê',
  'Ã‹' => 'Ë', 'ÃŒ' => 'Ì', "\xC3\x8D" => 'Í', 'ÃŽ' => 'Î', "\xC3\x8F" => 'Ï',
  "\xC3\x90" => 'Ð', 'Ã‘' => 'Ñ', 'Ã’' => 'Ò', 'Ã“' => 'Ó', 'Ã”' => 'Ô',
  'Ã•' => 'Õ', 'Ã–' => 'Ö', 'Ã—' => '×', 'Ã˜' => 'Ø', 'Ã™' => 'Ù',
  'Ãš' => 'Ú', 'Ã›' => 'Û', 'Ãœ' => 'Ü', "\xC3\x9D" => 'Ý', 'Ãž' => 'Þ',
  'ÃŸ' => 'ß', 'Ã ' => 'à', 'Ã¡' => 'á', 'Ã¢' => 'â', 'Ã£' => 'ã',
  'Ã¤' => 'ä', 'Ã¥' => 'å', 'Ã¦' => 'æ', 'Ã§' => 'ç', 'Ã¨' => 'è',
  'Ã©' => 'é', 'Ãª' => 'ê', 'Ã«' => 'ë', 'Ã¬' => 'ì', 'Ã' => 'í',
  'Ã®' => 'î', 'Ã¯' => 'ï', 'Ã°' => 'ð', 'Ã±' => 'ñ', 'Ã²' => 'ò',
  'Ã³' => 'ó', 'Ã´' => 'ô', 'Ãµ' => 'õ', 'Ã¶' => 'ö', 'Ã·' => '÷',
  'Ã¸' => 'ø', 'Ã¹' => 'ù', 'Ãº' => 'ú', 'Ã»' => 'û', 'Ã¼' => 'ü',
  'Ã½' => 'ý', 'Ã¾' => 'þ', 'Ã¿' => 'ÿ',
  "\x00" => '' # Manually added to avoid Bad Argument exception
}.freeze

# One alternation of every BAD_ENCODING key, used by fix_encoding_errors!.
# Keys are interpolated verbatim (no Regexp.escape), and alternatives are tried
# in hash order, so a key that is a prefix of a later key wins over it.
BAD_ENCODING_PATTERNS =

/(#{BAD_ENCODING.keys.join('|')})/

Instance Method Summary collapse

#ansi_attributes(*args) ⇒ Object
#approximate_latin_chars! ⇒ Object

Any characters that resemble latin characters might usefully be transliterated into ones that are easy to type on an anglophone keyboard.
#downcase_after_apostrophe! ⇒ Object
#ensure_safe! ⇒ Object
#ensure_space_after_initials! ⇒ Object
#fix_apostrophe_modifiers! ⇒ Object
#fix_encoding_errors! ⇒ Object

Strings that were wrongly encoded with single-byte encodings sometimes have tell-tale substrings that we can put back into the correct UTF-8 character.
#fix_ff! ⇒ Object

Fix ff weirdybonks (surnames conventionally written with a lowercase double f, e.g. fforde).
#fix_mac! ⇒ Object

Our list of terminal characters that indicate a non-celtic name used to include o but we removed it because of MacMurdo.
#fix_name_modifiers! ⇒ Object

Fixes for name modifiers followed by a space (the spaces are also replaced with non-breaking spaces) and for name modifiers followed by an apostrophe, e.g. d’Artagnan.
#fix_separators!(separator) ⇒ Object

Make sure separators are not where they shouldn’t be.
#invalid_chars_to!(separator) ⇒ Object

Change some characters embedded in words to our separator character e.g.
#nbsp_in_compound_name! ⇒ Object

Fix known last names that have spaces (not hyphens!).
#nbsp_in_name_modifier! ⇒ Object
#presence ⇒ Object
#remove_periods_from_initials! ⇒ Object
#remove_spaces_from_initials! ⇒ Object
#safe_unescape! ⇒ Object

Unescape percent-encoded characters. This might introduce an invalid UTF-8 byte sequence, so we take precautions.
#space_around_comma! ⇒ Object

Ensure commas have exactly one space after them.
#strip_or_self! ⇒ Object
#strip_unwanted!(filter) ⇒ Object

Strip illegal characters out completely.
#substitute!(pattern, replacement) ⇒ Object
#unescape_html! ⇒ Object

Remove HTML entities.
#upcase_first_letter! ⇒ Object
#upcase_initials! ⇒ Object

Upcase words with no vowels, e.g. JPR Williams (except Ng).
#whitespace_to!(separator) ⇒ Object

Change any whitespace into our separator character.

Instance Method Details

#ansi_attributes(*args) ⇒ `Object`



276
277
278

# File 'lib/name_tamer/string.rb', line 276

# Wraps the receiver in an ANSI escape sequence built from the given codes.
#
# @param args [Array<#to_s>] attribute codes, joined with ';'
# @return [String] a new string: "\e[<codes>m", the receiver, then "\e[0m"
def ansi_attributes(*args)
  sgr_codes = args.join(';')
  "\e[#{sgr_codes}m#{self}\e[0m"
end

#approximate_latin_chars! ⇒ `Object`

Any characters that resemble latin characters might usefully be transliterated into ones that are easy to type on an anglophone keyboard.



68
69
70

# File 'lib/name_tamer/string.rb', line 68

# Replaces, in place, every non-ASCII character that has an entry in
# APPROXIMATIONS with its mapped value; unmapped characters are kept as-is.
#
# @return [String] self, whether or not anything changed
def approximate_latin_chars!
  gsub!(/[^\x00-\x7f]/u) { |non_ascii| APPROXIMATIONS.fetch(non_ascii, non_ascii) }
  self
end

#downcase_after_apostrophe! ⇒ `Object`



82
83
84

# File 'lib/name_tamer/string.rb', line 82

# Lowercases a single word character that directly follows an apostrophe and
# ends the word (the "'S" in "John'S"), in place.
#
# @return [String] self, whether or not anything changed
def downcase_after_apostrophe!
  gsub!(/\'\w\b/) { |apostrophe_letter| apostrophe_letter.downcase } # Lowercase 's
  self
end

#ensure_safe! ⇒ `Object`



171
172
173

# File 'lib/name_tamer/string.rb', line 171

# Re-encodes the receiver as UTF-8 in place, replacing invalid or undefined
# byte sequences with the empty string (i.e. dropping them).
#
# @return [String] self
def ensure_safe!
  drop_bad_bytes = { invalid: :replace, undef: :replace, replace: '' }
  encode!('UTF-8', **drop_bad_bytes)
end

#ensure_space_after_initials! ⇒ `Object`



167
168
169

# File 'lib/name_tamer/string.rb', line 167

# Inserts a space after an initial ("J.") when it is immediately followed by
# two or more letters/digits, e.g. "J.Smith" -> "J. Smith". Works in place.
#
# @return [String] self, whether or not anything changed
def ensure_space_after_initials!
  gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i, '\1 ')
  self
end

#fix_apostrophe_modifiers! ⇒ `Object`

# File 'lib/name_tamer/string.rb', line 125

# Lowercases the modifiers "Dell'" and "D'" when they are preceded by any
# character and followed by a word character, e.g. " D'Artagnan" ->
# " d'Artagnan". The preceding character is part of the match, so it is
# downcased as well and a modifier at the very start of the string is skipped.
#
# @return [String] self (allows chaining)
def fix_apostrophe_modifiers!
  %w(Dell D).each do |modifier|
    gsub!(/(.#{modifier}')(\w)/) do
      lead, next_char = Regexp.last_match.captures
      "#{lead.rstrip.downcase}#{next_char}"
    end
  end

  self
end

#fix_encoding_errors! ⇒ `Object`

Strings that were wrongly encoded with single-byte encodings sometimes have tell-tale substrings that we can put back into the correct UTF-8 character



74
75
76

# File 'lib/name_tamer/string.rb', line 74

# Replaces, in place, every substring matched by BAD_ENCODING_PATTERNS with the
# value BAD_ENCODING maps it to; a match with no entry is left unchanged.
#
# @return [String] self, whether or not anything changed
def fix_encoding_errors!
  gsub!(BAD_ENCODING_PATTERNS) { |garbled| BAD_ENCODING.fetch(garbled, garbled) }
  self
end

#fix_ff! ⇒ `Object`

Fix ff weirdybonks (surnames conventionally written with a lowercase double f, e.g. fforde)

# File 'lib/name_tamer/string.rb', line 103

# Rewrites a fixed list of "Ff..." surnames to their all-lowercase form
# (e.g. "Fforde" -> "fforde"), in place.
#
# @return [String] self (allows chaining)
def fix_ff!
  %w(Fforbes Fforde Ffinch Ffrench Ffoulkes).each do |capitalised|
    substitute!(capitalised, capitalised.downcase)
  end

  self
end

#fix_mac! ⇒ `Object`

Our list of terminal characters that indicate a non-celtic name used to include o but we removed it because of MacMurdo.

# File 'lib/name_tamer/string.rb', line 88

# Capitalises the part after a leading "Mac"/"Mc" (Macdonald -> MacDonald) and
# then restores a fixed list of names where that capital is wrong.
#
# The rewrite runs when the string contains a word starting with "Mc", or a
# "Mac" word whose final letter is not one of a, c, i, z, j. When it runs, it
# applies to every Mac/Mc word in the string.
#
# @return [String] self (allows chaining)
def fix_mac!
  # The terminal character must be a letter. A bare [^acizj] also matches the
  # space before the next word, which made "Maciej Kowalski" look like a
  # Celtic name ending in " " and produced "MacIej Kowalski".
  celtic_mac = /\bMac[A-Za-z]{2,}[A-Za-z&&[^acizj]]\b/
  return self unless self =~ celtic_mac || self =~ /\bMc/

  gsub!(/\b(Ma?c)([A-Za-z]+)/) { Regexp.last_match(1) + Regexp.last_match(2).capitalize }

  # Fix Mac exceptions
  %w(
    MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
    MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
  ).each { |mac_name| substitute!(/\b#{mac_name}/, mac_name.capitalize) }

  self # Allows chaining
end

#fix_name_modifiers! ⇒ `Object`

Fixes for name modifiers followed by a space (the spaces are also replaced with non-breaking spaces) and for name modifiers followed by an apostrophe, e.g. d’Artagnan, Commedia dell’Arte

# File 'lib/name_tamer/string.rb', line 114

# For every NAME_MODIFIERS fragment found at the start of the string or after
# whitespace and followed by whitespace or a hyphen: lowercases the modifier
# and turns ASCII spaces in the following separator into non-breaking spaces.
# Finishes by delegating to fix_apostrophe_modifiers!.
#
# @return [String] self (allows chaining)
def fix_name_modifiers!
  NAME_MODIFIERS.each do |modifier|
    modifier_then_gap = /((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/
    gsub!(modifier_then_gap) do
      found = Regexp.last_match
      lowered = found[1].rstrip.downcase
      gap = found[2].tr(ASCII_SPACE, NONBREAKING_SPACE)
      "#{lowered}#{gap}"
    end
  end

  fix_apostrophe_modifiers!
  self
end

#fix_separators!(separator) ⇒ `Object`

Make sure separators are not where they shouldn’t be

# File 'lib/name_tamer/string.rb', line 53

# Collapses runs of the separator into a single occurrence and removes a
# separator at the start or end of a line, in place.
#
# @param separator [String, nil] the separator; nil or '' makes this a no-op
# @return [String] self
def fix_separators!(separator)
  return self if separator.nil? || separator.empty?

  r = Regexp.escape(separator)

  # No more than one of the separator in a row. The non-capturing group makes
  # {2,} apply to the whole separator; without it the quantifier binds only to
  # the separator's last character (for "ab": "abb" matched, "abab" did not).
  substitute!(/(?:#{r}){2,}/, separator)

  # Remove leading/trailing separator.
  substitute!(/^#{r}|#{r}$/i, '')
end

#invalid_chars_to!(separator) ⇒ `Object`

Change some characters embedded in words to our separator character e.g. example.com -> example-com



30
31
32

# File 'lib/name_tamer/string.rb', line 30

# Replaces each '.' or '/' that has no whitespace directly before or after it
# with the separator, in place (e.g. "example.com" -> "example-com").
#
# @param separator [String] replacement text
# @return [String] self
def invalid_chars_to!(separator)
  embedded_dot_or_slash = %r{(?<![[:space:]])[\.\/](?![[:space:]])}
  substitute!(embedded_dot_or_slash, separator)
end

#nbsp_in_compound_name! ⇒ `Object`

Fix known last names that have spaces (not hyphens!)

# File 'lib/name_tamer/string.rb', line 141

# Replaces each COMPOUND_NAMES phrase found in the receiver with the same
# phrase whose ASCII spaces are non-breaking spaces, in place.
#
# @return [String] self (allows chaining)
def nbsp_in_compound_name!
  COMPOUND_NAMES.each do |spaced|
    glued = spaced.tr(ASCII_SPACE, NONBREAKING_SPACE)
    substitute!(spaced, glued)
  end

  self
end

#nbsp_in_name_modifier! ⇒ `Object`

# File 'lib/name_tamer/string.rb', line 149

# For every NAME_MODIFIERS fragment (matched case-insensitively) that sits
# between two whitespace characters, replaces the trailing whitespace
# character with a non-breaking space, in place.
#
# @return [String] self (allows chaining)
def nbsp_in_name_modifier!
  NAME_MODIFIERS.each do |modifier|
    spaced_modifier = /([[:space:]]#{modifier})([[:space:]])/i
    gsub!(spaced_modifier) { Regexp.last_match(1) + NONBREAKING_SPACE }
  end

  self
end

#presence ⇒ `Object`



4
5
6

# File 'lib/name_tamer/string.rb', line 4

# Returns the receiver unless it is empty.
#
# @return [String, nil] self, or nil when the string is empty
def presence
  empty? ? nil : self
end

#remove_periods_from_initials! ⇒ `Object`



157
158
159

# File 'lib/name_tamer/string.rb', line 157

# Removes the period that follows a single-letter word ("J." -> "J"), in place.
#
# @return [String] self, whether or not anything changed
def remove_periods_from_initials!
  gsub!(/\b([a-z])\./i, '\1')
  self
end

#remove_spaces_from_initials! ⇒ `Object`

# File 'lib/name_tamer/string.rb', line 161

# Removes the space after a single-letter initial (with optional periods)
# unless the next word is two or more characters long, so "J P R Williams"
# becomes "JPR Williams". Works in place.
#
# @return [String] self, whether or not anything changed
def remove_spaces_from_initials!
  # captures is [letter, last period or nil]; join renders nil as ''.
  gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i) { Regexp.last_match.captures.join }
  self
end

#safe_unescape! ⇒ `Object`

Unescape percent-encoded characters. This might introduce an invalid UTF-8 byte sequence, so we take precautions

# File 'lib/name_tamer/string.rb', line 37

# Percent-decodes the receiver in place, then runs ensure_safe! because the
# decoded bytes may not form valid UTF-8.
#
# URI.unescape was removed in Ruby 3.0. It delegated to the RFC 2396 parser's
# #unescape, which is called directly here so decoding behaves the same on
# older and newer Rubies ('+' is left alone; only %XX sequences are decoded).
#
# @return [String] self; unchanged when there is nothing to decode or when
#   decoding raises Encoding::CompatibilityError
def safe_unescape!
  # URI::RFC2396_PARSER only exists in newer uri versions; older ones expose
  # the same parser as URI::DEFAULT_PARSER.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  begin
    decoded = parser.unescape(self)
  rescue Encoding::CompatibilityError # e.g. "\u2019%80"
    return self
  end

  return self if decoded == self

  replace(decoded)
  ensure_safe!
end

#space_around_comma! ⇒ `Object`

Ensure commas have exactly one space after them



24
25
26

# File 'lib/name_tamer/string.rb', line 24

# Normalises every comma, together with any whitespace around it, to ', '
# (no space before, exactly one after). Works in place.
#
# @return [String] self
def space_around_comma!
  comma_with_padding = /[[:space:]]*,[[:space:]]*/
  substitute!(comma_with_padding, ', ')
end

#strip_or_self! ⇒ `Object`



14
15
16

# File 'lib/name_tamer/string.rb', line 14

# Strips leading and trailing whitespace in place. Unlike strip!, which
# returns nil when nothing was removed, this always returns the receiver.
#
# @return [String] self
def strip_or_self!
  strip!
  self
end

#strip_unwanted!(filter) ⇒ `Object`

Strip illegal characters out completely



10
11
12

# File 'lib/name_tamer/string.rb', line 10

# Deletes every match of the filter from the receiver, in place.
#
# @param filter [Regexp, String] what to remove
# @return [String] self
def strip_unwanted!(filter)
  nothing = ''
  substitute!(filter, nothing)
end

#substitute!(pattern, replacement) ⇒ `Object`



175
176
177

# File 'lib/name_tamer/string.rb', line 175

# Chain-friendly gsub!: performs the substitution in place and always returns
# the receiver, even when gsub! would return nil because nothing matched.
#
# @param pattern [Regexp, String] what to look for
# @param replacement [String, Hash] what to put in its place (as for gsub!)
# @return [String] self
def substitute!(pattern, replacement)
  gsub!(pattern, replacement)
  self
end

#unescape_html! ⇒ `Object`

Remove HTML entities



48
49
50

# File 'lib/name_tamer/string.rb', line 48

# Replaces the receiver's contents with the result of CGI.unescapeHTML, i.e.
# HTML entities such as "&amp;" become the characters they stand for.
#
# @return [String] self
def unescape_html!
  decoded = CGI.unescapeHTML(self)
  replace(decoded)
end

#upcase_first_letter! ⇒ `Object`



78
79
80

# File 'lib/name_tamer/string.rb', line 78

# Upcases the first word character after every word boundary, in place
# ("john o'brien" -> "John O'Brien").
#
# @return [String] self, whether or not anything changed
def upcase_first_letter!
  gsub!(/\b\w/) { |initial| initial.upcase }
  self
end

#upcase_initials! ⇒ `Object`

Upcase words with no vowels, e.g. JPR Williams (except Ng)

# File 'lib/name_tamer/string.rb', line 135

# Upcases every word made up solely of the consonants listed in the pattern
# (y is not among them), then puts the word "Ng" back into capitalised form.
# Works in place.
#
# @return [String] self, whether or not anything changed
def upcase_initials!
  # \b is zero-width, so the whole match is exactly the captured word.
  gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i, &:upcase)
  gsub!(/\b(NG)\b/i, &:capitalize) # http://en.wikipedia.org/wiki/Ng
  self
end

#whitespace_to!(separator) ⇒ `Object`

Change any whitespace into our separator character



19
20
21

# File 'lib/name_tamer/string.rb', line 19

# Replaces every run of whitespace with a single separator, in place.
#
# @param separator [String] replacement text
# @return [String] self
def whitespace_to!(separator)
  whitespace_run = /[[:space:]]+/
  substitute!(whitespace_run, separator)
end

Class: String

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#ansi_attributes(*args) ⇒ Object

#approximate_latin_chars! ⇒ Object

#downcase_after_apostrophe! ⇒ Object

#ensure_safe! ⇒ Object

#ensure_space_after_initials! ⇒ Object

#fix_apostrophe_modifiers! ⇒ Object

#fix_encoding_errors! ⇒ Object

#fix_ff! ⇒ Object

#fix_mac! ⇒ Object

#fix_name_modifiers! ⇒ Object

#fix_separators!(separator) ⇒ Object

#invalid_chars_to!(separator) ⇒ Object

#nbsp_in_compound_name! ⇒ Object

#nbsp_in_name_modifier! ⇒ Object

#presence ⇒ Object

#remove_periods_from_initials! ⇒ Object

#remove_spaces_from_initials! ⇒ Object

#safe_unescape! ⇒ Object

#space_around_comma! ⇒ Object

#strip_or_self! ⇒ Object

#strip_unwanted!(filter) ⇒ Object

#substitute!(pattern, replacement) ⇒ Object

#unescape_html! ⇒ Object

#upcase_first_letter! ⇒ Object

#upcase_initials! ⇒ Object

#whitespace_to!(separator) ⇒ Object