Module: StringUtils

Extended by:
StringUtils
Included in:
StringUtils
Defined in:
lib/string_utils.rb,
lib/string_utils/version.rb,
lib/string_utils/transliteration.rb

Overview

StringUtils is a library that provides various handy string manipulation methods. Example usage:

* StringUtils.truncate("hello world", 10, "...") #=> "hello..."
* StringUtils.normalize_name "\302\240  Gran Via/Avda.de Asturias " #=> "Gran Via / Avda. de Asturias"
* StringUtils.urlify("waßer") #=> "wasser"
* StringUtils.normalize_punctuation(" , a,,b ,") #=> "a, b"

Constant Summary collapse

# Non-breaking space (U+00A0), spelled as its two UTF-8 bytes.
NBSP =
"\302\240"
# Regexp source for a single whitespace character: anything in the regexp
# class \s, or a non-breaking space. The backslash is doubled on purpose:
# inside a double-quoted Ruby string a bare "\s" is the space character, which
# would leave tabs, newlines and other whitespace unmatched.
WHITESPACE_MATCHER =
"(?:\\s|#{NBSP})"
# Matches one whitespace character (NBSP included).
WHITESPACE =
/#{WHITESPACE_MATCHER}/
# Regexp source for a single character that is neither whitespace nor NBSP.
NOT_WHITESPACE =
"[^\\s#{NBSP}]"
# Matches a run of one or more whitespace characters (NBSP included).
WHITESPACES =
/#{WHITESPACE_MATCHER}+/
VERSION =
"1.0.8"
TRANSLITERATIONS =

# Based on transliteration table from i18n v0.5.0
#
# Maps a single non-ASCII character to its ASCII approximation. Every key must
# be exactly one character, because lookups are done one matched character at
# a time. Keys whose glyphs are easily damaged by text normalisation are
# written as UTF-8 octal byte escapes.

{
    # Latin
    "À" =>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
    "Ç" =>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
    "Î" =>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
    "Õ" =>"O", "Ö"=>"O", "×"=>"x", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U",
    "Ü" =>"U", "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a",
    "ã" =>"a", "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e",
    "ê" =>"e", "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d",
    "ñ" =>"n", "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o",
    "ù" =>"u", "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y",
    "Ā" =>"A", "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C",
    "ć" =>"c", "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c",
    "Ď" =>"D", "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E",
    "ĕ" =>"e", "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e",
    "Ĝ" =>"G", "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G",
    "ģ" =>"g", "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i",
    "Ī" =>"I", "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I",
    # "\304\262" / "\304\263" are the single-character IJ ligatures U+0132 / U+0133.
    "ı" =>"i", "\304\262"=>"IJ", "\304\263"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k",
    "ĸ" =>"k", "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l",
    "Ŀ" =>"L", "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N",
    # "\305\211" is U+0149 (n preceded by apostrophe) as one character.
    "ņ" =>"n", "Ň"=>"N", "ň"=>"n", "\305\211"=>"'n", "Ŋ"=>"NG", "ŋ"=>"ng",
    "Ō" =>"O", "ō"=>"o", "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE",
    "œ" =>"oe", "Ŕ"=>"R", "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r",
    "Ś" =>"S", "ś"=>"s", "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S",
    "š" =>"s", "Ţ"=>"T", "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t",
    "Ũ" =>"U", "ũ"=>"u", "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U",
    "ů" =>"u", "Ű"=>"U", "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w",
    "Ŷ" =>"Y", "ŷ"=>"y", "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z",
    "Ž" =>"Z", "ž"=>"z",

    # Cyrillic
    "Ґ" =>"G", "Ё"=>"YO", "Є"=>"E", "Ї"=>"YI", "І"=>"I",
    "А" =>"A", "Б"=>"B", "В"=>"V", "Г"=>"G",
    "Д" =>"D", "Е"=>"E", "Ж"=>"ZH", "З"=>"Z", "И"=>"I",
    "Й" =>"Y", "К"=>"K", "Л"=>"L", "М"=>"M", "Н"=>"N",
    "О" =>"O", "П"=>"P", "Р"=>"R", "С"=>"S", "Т"=>"T",
    "У" =>"U", "Ф"=>"F", "Х"=>"H", "Ц"=>"TS", "Ч"=>"CH",
    "Ш" =>"SH", "Щ"=>"SCH", "Ъ"=>"'", "Ы"=>"Y", "Ь"=>"",
    "Э" =>"E", "Ю"=>"YU", "Я"=>"YA", "і"=>"i",
    # "\342\204\226" is the numero sign U+2116; an empty-string key could never match.
    "ґ" =>"g", "ё"=>"yo", "\342\204\226"=>"#", "є"=>"e",
    "ї" =>"yi", "а"=>"a", "б"=>"b",
    "в" =>"v", "г"=>"g", "д"=>"d", "е"=>"e", "ж"=>"zh",
    "з" =>"z", "и"=>"i", "й"=>"y", "к"=>"k", "л"=>"l",
    "м" =>"m", "н"=>"n", "о"=>"o", "п"=>"p", "р"=>"r",
    "с" =>"s", "т"=>"t", "у"=>"u", "ф"=>"f", "х"=>"h",
    "ц" =>"ts", "ч"=>"ch", "ш"=>"sh", "щ"=>"sch", "ъ"=>"'",
    "ы" =>"y", "ь"=>"", "э"=>"e", "ю"=>"yu", "я"=>"ya",

    # Greek
    'α' => 'a',
    'η' => 'h',
    'ν' => 'n',
    'τ' => 't',
    'β' => 'b',
    'θ' => 'th',
    'ξ' => 'x',
    'υ' => 'y',
    'γ' => 'g',
    'ι' => 'i',
    'ο' => 'o',
    'φ' => 'f',
    'δ' => 'd',
    'κ' => 'k',
    'π' => 'p',
    'χ' => 'ch',
    'ε' => 'e',
    'λ' => 'l',
    'ρ' => 'r',
    'ψ' => 'ps',
    'ζ' => 'z',
    'μ' => 'm',
    'σ' => 's',
    'ω' => 'w',
    'Θ' => 'Th',
    'Ξ' => 'X',
    'Γ' => 'G',
    'Φ' => 'F',
    'Δ' => 'D',
    'Π' => 'P',
    'Λ' => 'L',
    'Ρ' => 'R',
    'Ψ' => 'Ps',
    'Σ' => 'S',
    'Ω' => 'W'
}

Instance Method Summary collapse

Instance Method Details

#mb_charify(text) ⇒ Object

Returns a unicode compatible version of the string

support any of:

* ruby 1.9 sane utf8 support
* rails 2.1 workaround for crappy ruby 1.8 utf8 support
* rails 2.2 workaround for crappy ruby 1.8 utf8 support

hooray!



166
167
168
169
170
171
172
173
174
# File 'lib/string_utils.rb', line 166

# Returns a character-aware object for +text+.
#
# On Ruby 1.9 or later this is a duplicate of +text+. On older interpreters
# the result of +mb_chars+ is returned instead.
#
# Raises RuntimeError when running on a pre-1.9 interpreter and +text+ does
# not respond to +mb_chars+.
def mb_charify(text)
  # Modern interpreters: the string itself is sufficient, so hand back a copy.
  return text.dup if RUBY_VERSION >= '1.9'

  unless text.respond_to?(:mb_chars)
    raise "StringUtils: No unicode support for strings"
  end

  # mb_chars is invoked on a duplicate when the receiver is frozen.
  receiver = text.frozen? ? text.dup : text
  receiver.mb_chars
end

#normalize_name(value, options = {}) ⇒ Object

Normalizes whitespace: “a , a” => “a, a”, “a ,a” => “a, a”, “a,a” => “a, a”, “a/b” => “a / b”, “a/ b” => “a / b”, “a /b” => “a / b”. Removes trailing and leading [.,]. Options: :titleize => true (default false).



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/string_utils.rb', line 73

# Normalizes the spacing and punctuation of a name.
#
# * collapses whitespace runs (including NBSP) and newlines to single spaces
# * removes one leading and one trailing "." or ","
# * removes a surrounding pair of quotes
# * puts exactly one space after "," and "." and none before them
# * collapses repeated slashes and surrounds "/" with single spaces
#
# value   - the String to normalize (it is not modified; a copy is used).
# options - Hash; :titleize => true additionally calls +titleize+ on the
#           value and lower-cases the words Of/And/A/An/The/To when they
#           appear between two whitespace characters (default: false).
#
# Returns the normalized String.
def normalize_name(value, options = {})
  value = mb_charify(value)

  # Normalize whitespace
  value.gsub!("\n", ' ')
  value.gsub!(WHITESPACES, ' ')
  value.strip!

  # Remove trailing and leading .,
  # (no newlines remain at this point, so \A / \z anchor the whole value)
  value.gsub!(/\A[.,]/, '')
  value.gsub!(/[.,]\z/, '')

  # Remove quote pairs. Imperfect, but good enough
  value.gsub!(/\A['"]+(.*)['"]+\z/, '\1')

  # Dropping punctuation or quotes can expose whitespace at either end
  # ("Foo ." => "Foo "), so trim once more.
  value.strip!

  # "a ,a"  => "a, a"
  # "a,a"   => "a, a"
  # "a , a" => "a, a"
  value.gsub!(/#{WHITESPACE_MATCHER}([,.])/, '\1')
  value.gsub!(/([,.])(#{NOT_WHITESPACE})/, '\1 \2')

  # "//" => "/"
  value.gsub!(/\/+/, '/')

  # "a/b" => "a / b", "a/ b" => "a / b", "a /b" => "a / b"
  value.gsub!(/(#{NOT_WHITESPACE})\//, '\1 /')
  value.gsub!(/\/(#{NOT_WHITESPACE})/, '/ \1')

  if options[:titleize]
    value = value.titleize
    # The trailing whitespace is only looked at, not consumed, so that it can
    # also serve as the leading whitespace of the next small word ("Of The").
    value.gsub!(/#{WHITESPACE_MATCHER}(Of|And|A|An|The|To)(?=#{WHITESPACE_MATCHER})/) { |m| m.downcase }
  end
  value.to_s
end

#normalize_punctuation(str) ⇒ Object

Collapses spaces and commas. Fixes spacing around the following characters:

,.;:&

Removes consecutive character dupes. Removes trailing and leading commas.



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/string_utils.rb', line 31

# Tidies spacing around punctuation in +str+.
#
# * collapses every whitespace run to a single space
# * removes whitespace around , . ; : &
# * collapses a run of those characters to its last character
# * drops one leading and one trailing punctuation character
# * puts a single space after , . ; : and a space on both sides of &
#
# str - the String to normalize (it is not modified).
#
# Returns a new, stripped String.
def normalize_punctuation(str)
  # Non-destructive gsub gives us a private copy to edit in place.
  s = str.gsub(/\s+/, ' ')

  # Collapse w/s around all
  s.gsub!(/\s*([:,&.;])\s*/, '\1')
  # Collapse consecutive dupes
  s.gsub!(/([.,;&:])+/, '\1')

  # Collapse leading and trailing punctuation
  # (whitespace was collapsed to spaces above, so no newlines remain)
  s.gsub!(/\A\s*[,:&;.]|[.;&:,]\s*\z/, '')

  # Add whitespaces
  s.gsub!(/([,.;:])(\S)/, '\1 \2')
  # Lookarounds leave the neighbours unconsumed, so a character that sits
  # between two ampersands ("a&b&c") is spaced on both sides.
  s.gsub!(/(?<=\S)&(?=\S)/, ' & ')

  s.strip!
  s
end

#truncate(text, *args) ⇒ Object

Truncates the string. The result will be :length characters or shorter, and words will not be cut in the middle. Arguments: :length => Integer (default: 30), :omission => String (default: “…”).



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/string_utils.rb', line 113

# Truncates +text+ so the result is at most :length characters long, without
# cutting a word in the middle, and appends :omission to a shortened result.
#
# Both calling conventions are accepted and may be combined:
#
#   truncate(text, length, omission)
#   truncate(text, :length => n, :omission => str)
#
# A positional value takes precedence over the same option in the hash.
#
# text - the String to truncate; nil/false yields "".
# args - optional length, optional omission, optional trailing options Hash
#        (:length, default 30; :omission, default "").
#
# NOTE(review): the method summary lists “…” as the default omission, but the
# default used here is the empty string -- confirm which one is intended.
#
# Returns +text+ itself when it already fits, otherwise a new String.
def truncate(text, *args)
  # Copy the trailing options hash so the caller's object is never modified.
  options = args.last.is_a?(Hash) ? args.pop.dup : {}

  # support either old or Rails 2.2 calling convention:
  # a positional value only overrides the hash when it was actually given.
  options[:length]   = args[0] if args[0]
  options[:omission] = args[1] if args[1]

  options          = {:length => 30, :omission => ""}.merge(options)
  options[:length] = options[:length].to_i

  return "" unless text
  chars = mb_charify(text)

  # If we can return it straight away or rstrip it and return it, we do it here
  if chars.length <= options[:length]
    return text
  elsif (chars = rstrip_with_nbsp(chars)).length <= options[:length]
    return chars.to_s
  end

  omission = mb_charify(options[:omission])

  # Here we know we have to remove at least 1 word
  # 1. Get the first l characters
  # 2. Remove the last word if it's a part
  # 3. Add omission
  length_wo_omission = options[:length] - omission.length

  # The omission alone is already longer than the allowed length.
  return '' if length_wo_omission < 0

  result = rstrip_with_nbsp(chars[0...length_wo_omission] || "")

  # Remove the last word unless we happened to trim it exactly already
  unless chars[length_wo_omission] =~ WHITESPACE || result.length < length_wo_omission
    len    = result.split(WHITESPACES).last
    len    &&= len.length
    result = rstrip_with_nbsp(result[0...(result.length - (len || 0))])
  end

  result += options[:omission]
  result.to_s
end

#urlify(string, opts = {}) ⇒ Object

Converts a string to a nicely readable URL. opts: :default_replacement – string to use for unknown characters (Default: “”); :whitespace_replacement – string to use to replace whitespace+ (Default: “-”).



57
58
59
60
61
62
63
64
# File 'lib/string_utils.rb', line 57

# Converts +string+ into a lower-case, ASCII-only URL fragment.
#
# string - the String to convert (it is not modified).
# opts   - Hash of options:
#          :whitespace_replacement - replaces each inner whitespace run
#                                    (default: "-")
#          :default_replacement    - replaces characters that have no
#                                    transliteration or are not allowed
#                                    (default: "")
#
# Returns a new String containing only a-z, 0-9, "-", "+", "_" and whatever
# :default_replacement contributes.
def urlify(string, opts = {})
  opts = {:whitespace_replacement => '-', :default_replacement => ""}.merge(opts)

  # Collapse whitespace runs to one space and trim BEFORE substituting the
  # replacement; otherwise surrounding whitespace turns into leading and
  # trailing separators (" a " => "-a-").
  slug = string.gsub(WHITESPACES, ' ')
  slug.strip!
  slug.gsub!(' ', opts[:whitespace_replacement])

  # Transliterate every non-ASCII character, falling back to the default.
  slug.gsub!(/[^\x00-\x7f]/u) { |char| TRANSLITERATIONS[char] || opts[:default_replacement] }

  # The filter below only admits lower-case letters, so lower-case first;
  # upper-case input ("Hello", or "A" transliterated from an accented capital)
  # would otherwise be deleted.
  slug.downcase!
  slug.gsub!(/[^a-z0-9\-+_]/, opts[:default_replacement])
  slug
end