Module: StringUtils

Extended by:
StringUtils
Included in:
StringUtils
Defined in:
lib/string_utils.rb,
lib/string_utils/version.rb,
lib/string_utils/transliteration.rb

Overview

StringUtils is a library that provides various handy string manipulation methods. Example usage:

* StringUtils.truncate("hello world", 10, "...") #=> "hello..."
* StringUtils.normalize_name "\302\240  Gran Via/Avda.de Asturias " #=> "Gran Via / Avda. de Asturias"
* StringUtils.urlify("waßer") #=> "wasser"

Constant Summary collapse

# UTF-8 byte sequence (0xC2 0xA0, written as octal escapes) of U+00A0 NO-BREAK SPACE.
NBSP =
"\302\240"
# Regexp source fragment matching a single whitespace character or an NBSP.
# NOTE(review): inside a double-quoted Ruby string "\s" is the escape for a
# plain space (0x20), not the regexp whitespace class, so as written this
# fragment matches only a space or NBSP (tabs/newlines are not covered).
# Confirm this is intended before changing it; the methods using it depend on it.
WHITESPACE_MATCHER =
"(?:\s|#{NBSP})"
# Regexp matching exactly one WHITESPACE_MATCHER occurrence.
WHITESPACE =
/#{WHITESPACE_MATCHER}/
# Regexp source fragment: character class negating the characters listed above
# (same "\s" caveat as WHITESPACE_MATCHER).
NOT_WHITESPACE =
"[^\s#{NBSP}]"
# Regexp matching a run of one or more WHITESPACE_MATCHER occurrences.
WHITESPACES =
/#{WHITESPACE_MATCHER}+/
# Library version string.
VERSION =
"1.0.4"
# Maps a single non-ASCII character to its ASCII replacement string.
# Based on transliteration table from i18n v0.5.0
#
# Every key must be exactly one non-ASCII character: lookups are done per
# matched character, so multi-character or empty keys can never be hit.
# The ligatures "Ĳ"/"ĳ" (U+0132/U+0133), "ŉ" (U+0149) and "№" (U+2116) are
# therefore spelled with their real code points rather than ASCII look-alikes.
TRANSLITERATIONS = {
    # Latin
    "À" =>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
    "Ç" =>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
    "Î" =>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
    "Õ" =>"O", "Ö"=>"O", "×"=>"x", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U",
    "Ü" =>"U", "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a",
    "ã" =>"a", "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e",
    "ê" =>"e", "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d",
    "ñ" =>"n", "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o",
    "ù" =>"u", "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y",
    "Ā" =>"A", "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C",
    "ć" =>"c", "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c",
    "Ď" =>"D", "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E",
    "ĕ" =>"e", "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e",
    "Ĝ" =>"G", "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G",
    "ģ" =>"g", "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i",
    "Ī" =>"I", "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I",
    "ı" =>"i", "Ĳ"=>"IJ", "ĳ"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k",
    "ĸ" =>"k", "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l",
    "Ŀ" =>"L", "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N",
    "ņ" =>"n", "Ň"=>"N", "ň"=>"n", "ŉ"=>"'n", "Ŋ"=>"NG", "ŋ"=>"ng",
    "Ō" =>"O", "ō"=>"o", "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE",
    "œ" =>"oe", "Ŕ"=>"R", "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r",
    "Ś" =>"S", "ś"=>"s", "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S",
    "š" =>"s", "Ţ"=>"T", "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t",
    "Ũ" =>"U", "ũ"=>"u", "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U",
    "ů" =>"u", "Ű"=>"U", "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w",
    "Ŷ" =>"Y", "ŷ"=>"y", "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z",
    "Ž" =>"Z", "ž"=>"z",

    # Cyrillic
    "Ґ" =>"G", "Ё"=>"YO", "Є"=>"E", "Ї"=>"YI", "І"=>"I",
    "А" =>"A", "Б"=>"B", "В"=>"V", "Г"=>"G",
    "Д" =>"D", "Е"=>"E", "Ж"=>"ZH", "З"=>"Z", "И"=>"I",
    "Й" =>"Y", "К"=>"K", "Л"=>"L", "М"=>"M", "Н"=>"N",
    "О" =>"O", "П"=>"P", "Р"=>"R", "С"=>"S", "Т"=>"T",
    "У" =>"U", "Ф"=>"F", "Х"=>"H", "Ц"=>"TS", "Ч"=>"CH",
    "Ш" =>"SH", "Щ"=>"SCH", "Ъ"=>"'", "Ы"=>"Y", "Ь"=>"",
    "Э" =>"E", "Ю"=>"YU", "Я"=>"YA", "і"=>"i",
    "ґ" =>"g", "ё"=>"yo", "№"=>"#", "є"=>"e",
    "ї" =>"yi", "а"=>"a", "б"=>"b",
    "в" =>"v", "г"=>"g", "д"=>"d", "е"=>"e", "ж"=>"zh",
    "з" =>"z", "и"=>"i", "й"=>"y", "к"=>"k", "л"=>"l",
    "м" =>"m", "н"=>"n", "о"=>"o", "п"=>"p", "р"=>"r",
    "с" =>"s", "т"=>"t", "у"=>"u", "ф"=>"f", "х"=>"h",
    "ц" =>"ts", "ч"=>"ch", "ш"=>"sh", "щ"=>"sch", "ъ"=>"'",
    "ы" =>"y", "ь"=>"", "э"=>"e", "ю"=>"yu", "я"=>"ya",

    # Greek
    'α' => 'a',
    'η' => 'h',
    'ν' => 'n',
    'τ' => 't',
    'β' => 'b',
    'θ' => 'th',
    'ξ' => 'x',
    'υ' => 'y',
    'γ' => 'g',
    'ι' => 'i',
    'ο' => 'o',
    'φ' => 'f',
    'δ' => 'd',
    'κ' => 'k',
    'π' => 'p',
    'χ' => 'ch',
    'ε' => 'e',
    'λ' => 'l',
    'ρ' => 'r',
    'ψ' => 'ps',
    'ζ' => 'z',
    'μ' => 'm',
    'σ' => 's',
    'ω' => 'w',
    'Θ' => 'Th',
    'Ξ' => 'X',
    'Γ' => 'G',
    'Φ' => 'F',
    'Δ' => 'D',
    'Π' => 'P',
    'Λ' => 'L',
    'Ρ' => 'R',
    'Ψ' => 'Ps',
    'Σ' => 'S',
    'Ω' => 'W'
}

Instance Method Summary collapse

Instance Method Details

#mb_charify(text) ⇒ Object

Returns a unicode compatible version of the string

support any of:

* ruby 1.9 sane utf8 support
* rails 2.1 workaround for crappy ruby 1.8 utf8 support
* rails 2.2 workaround for crappy ruby 1.8 utf8 support

hooray!



135
136
137
138
139
140
141
142
143
# File 'lib/string_utils.rb', line 135

# Returns a copy of +text+ that can be manipulated character-wise.
#
# * On Ruby >= 1.9 this is a plain duplicate of +text+.
# * On older Rubies, when +text+ responds to +mb_chars+ (presumably supplied
#   by Rails/ActiveSupport; it is not defined in this file), the +mb_chars+
#   proxy is returned; a frozen +text+ is duplicated first.
# * Otherwise a RuntimeError is raised.
def mb_charify(text)
  return text.dup if RUBY_VERSION >= '1.9'

  unless text.respond_to?(:mb_chars)
    raise "StringUtils: No unicode support for strings"
  end

  # Hand mb_chars an unfrozen receiver.
  source = text.frozen? ? text.dup : text
  source.mb_chars
end

#normalize_name(value, options = {}) ⇒ Object

Normalizes whitespace and punctuation spacing: “a , a” => “a, a”, “a ,a” => “a, a”, “a,a” => “a, a”, “a/b” => “a / b”, “a/ b” => “a / b”, “a /b” => “a / b”. Removes trailing and leading [.,]. Options: :titleize => true (default: false).



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/string_utils.rb', line 45

# Normalizes whitespace and punctuation spacing in +value+ and returns a String.
#
# * collapses whitespace runs (including NBSP and newlines) into one space
# * removes one leading and one trailing "." or ","
# * "a ,a" / "a,a" / "a , a" => "a, a"
# * "a/b" / "a/ b" / "a /b"  => "a / b" (repeated slashes are collapsed)
#
# Options:
#   :titleize => true  title-cases the result, keeping the small words
#                      of/and/a/an/the/to lower-case when they are surrounded
#                      by whitespace (default: false). Relies on the value
#                      responding to +titleize+, which is not defined in this
#                      file (presumably ActiveSupport).
#
# The argument itself is not modified; work is done on the copy returned by
# +mb_charify+.
def normalize_name(value, options = {})
  value = mb_charify(value)

  # Normalize whitespace
  value.gsub!("\n", ' ')
  value.gsub!(WHITESPACES, ' ')
  value.strip!

  # Remove trailing and leading .,
  value.gsub!(/^[.,]/, '')
  value.gsub!(/[.,]$/, '')
  # Dropping the punctuation can expose whitespace ("Madrid ." => "Madrid "),
  # so trim once more.
  value.strip!

  # "a ,a"  => "a, a"
  # "a,a"   => "a, a"
  # "a , a" => "a, a"
  value.gsub!(/#{WHITESPACE_MATCHER}([,.])/, '\1')
  value.gsub!(/([,.])(#{NOT_WHITESPACE})/, '\1 \2')

  # "//" => "/"
  value.gsub!(/\/+/, '/')

  # "a/b" => "a / b", "a/ b" => "a / b", "a /b" => "a / b"
  value.gsub!(/(#{NOT_WHITESPACE})\//, '\1 /')
  value.gsub!(/\/(#{NOT_WHITESPACE})/, '/ \1')

  if options[:titleize]
    value = value.titleize
    # The trailing whitespace is only looked at (lookahead), not consumed, so
    # it can still serve as the leading whitespace of the next small word:
    # "King Of The Hill" => "King of the Hill".
    value.gsub!(/#{WHITESPACE_MATCHER}(?:Of|And|A|An|The|To)(?=#{WHITESPACE_MATCHER})/) { |m| m.downcase }
  end
  value.to_s
end

#truncate(text, *args) ⇒ Object

Truncates the string. The result will be :length or shorter, and words will not be cut in the middle. Arguments: :length => Integer (default: 30), :omission => String (default: “…”).



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/string_utils.rb', line 82

# Truncates +text+ to at most :length characters without cutting a word in
# the middle, appending :omission when something was removed.
#
# Calling conventions:
#   truncate(text, length = 30, omission = "…")
#   truncate(text, :length => 30, :omission => "…")
#
# Returns "" when +text+ is nil/false or when the omission alone is longer
# than :length. Returns +text+ itself when it already fits.
# Relies on +mb_charify+ and +rstrip_with_nbsp+, the latter defined outside
# the code shown here (presumably strips trailing whitespace including NBSP).
def truncate(text, *args)
  # Work on a copy so a caller-supplied options hash is never modified.
  options = args.last.is_a?(Hash) ? args.pop.dup : {}

  # support either old or Rails 2.2 calling convention:
  unless args.empty?
    options[:length]   = args[0] || 30
    options[:omission] = args[1] || "…"
  end

  # Documented defaults: 30 characters, "…" as the omission marker.
  options          = {:length => 30, :omission => "…"}.merge(options)
  options[:length] = options[:length].to_i

  return "" if !text
  chars = mb_charify(text)


  # If we can return it straight away or rstrip it and return it, we do it here
  if chars.length <= options[:length]
    return text
  elsif (chars = rstrip_with_nbsp(chars)).length <= options[:length]
    return chars.to_s
  end

  omission           = mb_charify(options[:omission])

  # Here we know we have to remove at least 1 word
  # 1. Get the first l characters
  # 2. Remove the last word if it's a part
  # 3. Add omission
  length_wo_omission = options[:length] - omission.length

  return '' if length_wo_omission < 0

  result = rstrip_with_nbsp(chars[0...length_wo_omission] || "")

  # Remove the last word unless we happened to trim it exactly already
  unless chars[length_wo_omission] =~ WHITESPACE || result.length < length_wo_omission
    len    = result.split(WHITESPACES).last
    len    &&= len.length
    result = rstrip_with_nbsp(result[0...(result.length - (len || 0))])
  end

  result += options[:omission]
  result.to_s
end

#urlify(string, opts = {}) ⇒ Object

Converts a string to a nicely readable URL opts: :default_replacement – string to use for unknown characters (Default: “”) :whitespace_replacement – string to use to replace whitespace+ (Default: “-”)



29
30
31
32
33
34
35
36
# File 'lib/string_utils.rb', line 29

# Converts +string+ into a URL-friendly slug made only of a-z, 0-9, "-", "+"
# and "_" (plus whatever the replacement options contain).
#
# Steps:
# 1. trim leading/trailing whitespace (including NBSP) so it does not turn
#    into leading/trailing separators
# 2. lower-case ASCII letters, so they survive the final lower-case-only filter
# 3. replace each whitespace run with :whitespace_replacement
# 4. transliterate non-ASCII characters via TRANSLITERATIONS (lower-cased);
#    unknown characters become :default_replacement
# 5. replace every remaining disallowed character with :default_replacement
#
# opts:
#   :default_replacement    - string used for unknown characters (default: "")
#   :whitespace_replacement - string used for whitespace runs (default: "-")
#
# The argument is not modified; a new String is returned.
def urlify(string, opts = {})
  opts = {:whitespace_replacement => '-', :default_replacement => ""}.merge(opts)

  # Trim first: once whitespace has become "-" there is nothing left to strip.
  slug = string.gsub(/\A(?:\s|#{NBSP})+|(?:\s|#{NBSP})+\z/, '')
  # ASCII-only lower-casing; non-ASCII letters are handled by the table below.
  slug.tr!('A-Z', 'a-z')
  slug.gsub!(WHITESPACES, opts[:whitespace_replacement])
  slug.gsub!(/[^\x00-\x7f]/u) do |char|
    mapped = TRANSLITERATIONS[char]
    mapped ? mapped.downcase : opts[:default_replacement]
  end
  slug.gsub!(/[^a-z0-9\-+_]/, opts[:default_replacement])
  slug
end