Module: StringUtils

Extended by:: StringUtils

Included in:: StringUtils

Defined in:: lib/string_utils.rb,
lib/string_utils/version.rb,
lib/string_utils/transliteration.rb

Overview

StringUtils is a library that provides various handy string manipulation methods. Example usage:

* StringUtils.truncate("hello world", 10, "...") #=> "hello..."
* StringUtils.normalize_name "\302\240  Gran Via/Avda.de Asturias " #=> "Gran Via / Avda. de Asturias"
* StringUtils.urlify("waßer") #=> "wasser"
* StringUtils.normalize_punctuation(" , a,,b ,") #=> "a, b"

Constant Summary collapse

# The non-breaking space character (U+00A0), written as its two UTF-8 bytes
# in octal escapes.
NBSP =

"\302\240"

# Regexp *source fragment* (a String, not a Regexp) for a single whitespace
# character; it is interpolated into the patterns below and in the methods.
# NOTE(review): inside a double-quoted Ruby string, "\s" is the escape for a
# literal space, so as written this fragment appears to match only " " and
# NBSP (not tabs or newlines) — confirm whether "\\s" was intended.
WHITESPACE_MATCHER =

"(?:\s|#{NBSP})"

# Regexp matching exactly one whitespace character (see WHITESPACE_MATCHER).
WHITESPACE =

/#{WHITESPACE_MATCHER}/

# Regexp source fragment (String) for a single character that is neither
# whitespace nor NBSP. The same "\s"-in-double-quotes caveat applies here.
NOT_WHITESPACE =

"[^\s#{NBSP}]"

# Regexp matching a run of one or more whitespace characters.
WHITESPACES =

/#{WHITESPACE_MATCHER}+/

# Version string of the library.
VERSION =

"1.0.8"

TRANSLITERATIONS = Based on transliteration table from i18n v0.5.0

# Lookup table from a single non-ASCII character to its ASCII replacement
# string. urlify consults it for every character outside \x00-\x7f; characters
# that have no entry fall back to the caller's :default_replacement.
# A replacement may be longer than one character ("Æ" => "AE") or empty
# ("Ь" => ""), in which case the character is simply dropped.
{
    # Latin
    "À" =>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
    "Ç" =>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
    "Î" =>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
    "Õ" =>"O", "Ö"=>"O", "×"=>"x", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U",
    "Ü" =>"U", "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a",
    "ã" =>"a", "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e",
    "ê" =>"e", "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d",
    "ñ" =>"n", "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o",
    "ù" =>"u", "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y",
    "Ā" =>"A", "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C",
    "ć" =>"c", "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c",
    "Ď" =>"D", "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E",
    "ĕ" =>"e", "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e",
    "Ĝ" =>"G", "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G",
    "ģ" =>"g", "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i",
    "Ī" =>"I", "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I",
    "ı" =>"i", "Ĳ"=>"IJ", "ĳ"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k",
    "ĸ" =>"k", "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l",
    "Ŀ" =>"L", "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N",
    "ņ" =>"n", "Ň"=>"N", "ň"=>"n", "ŉ"=>"'n", "Ŋ"=>"NG", "ŋ"=>"ng",
    "Ō" =>"O", "ō"=>"o", "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE",
    "œ" =>"oe", "Ŕ"=>"R", "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r",
    "Ś" =>"S", "ś"=>"s", "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S",
    "š" =>"s", "Ţ"=>"T", "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t",
    "Ũ" =>"U", "ũ"=>"u", "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U",
    "ů" =>"u", "Ű"=>"U", "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w",
    "Ŷ" =>"Y", "ŷ"=>"y", "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z",
    "Ž" =>"Z", "ž"=>"z",

    # Cyrillic
    # The soft sign (Ь/ь) maps to the empty string; the hard sign (Ъ/ъ) maps
    # to an apostrophe.
    "Ґ" =>"G", "Ё"=>"YO", "Є"=>"E", "Ї"=>"YI", "І"=>"I",
    "А" =>"A", "Б"=>"B", "В"=>"V", "Г"=>"G",
    "Д" =>"D", "Е"=>"E", "Ж"=>"ZH", "З"=>"Z", "И"=>"I",
    "Й" =>"Y", "К"=>"K", "Л"=>"L", "М"=>"M", "Н"=>"N",
    "О" =>"O", "П"=>"P", "Р"=>"R", "С"=>"S", "Т"=>"T",
    "У" =>"U", "Ф"=>"F", "Х"=>"H", "Ц"=>"TS", "Ч"=>"CH",
    "Ш" =>"SH", "Щ"=>"SCH", "Ъ"=>"'", "Ы"=>"Y", "Ь"=>"",
    "Э" =>"E", "Ю"=>"YU", "Я"=>"YA", "і"=>"i",
    "ґ" =>"g", "ё"=>"yo", "№"=>"#", "є"=>"e",
    "ї" =>"yi", "а"=>"a", "б"=>"b",
    "в" =>"v", "г"=>"g", "д"=>"d", "е"=>"e", "ж"=>"zh",
    "з" =>"z", "и"=>"i", "й"=>"y", "к"=>"k", "л"=>"l",
    "м" =>"m", "н"=>"n", "о"=>"o", "п"=>"p", "р"=>"r",
    "с" =>"s", "т"=>"t", "у"=>"u", "ф"=>"f", "х"=>"h",
    "ц" =>"ts", "ч"=>"ch", "ш"=>"sh", "щ"=>"sch", "ъ"=>"'",
    "ы" =>"y", "ь"=>"", "э"=>"e", "ю"=>"yu", "я"=>"ya",

    # Greek
    # NOTE(review): every lower-case letter listed below has a mapping, but
    # only some capitals do (Θ Ξ Γ Φ Δ Π Λ Ρ Ψ Σ Ω; e.g. no Α, Β or Ε) —
    # confirm whether the omission is intentional.
    'α' => 'a',
    'η' => 'h',
    'ν' => 'n',
    'τ' => 't',
    'β' => 'b',
    'θ' => 'th',
    'ξ' => 'x',
    'υ' => 'y',
    'γ' => 'g',
    'ι' => 'i',
    'ο' => 'o',
    'φ' => 'f',
    'δ' => 'd',
    'κ' => 'k',
    'π' => 'p',
    'χ' => 'ch',
    'ε' => 'e',
    'λ' => 'l',
    'ρ' => 'r',
    'ψ' => 'ps',
    'ζ' => 'z',
    'μ' => 'm',
    'σ' => 's',
    'ω' => 'w',
    'Θ' => 'Th',
    'Ξ' => 'X',
    'Γ' => 'G',
    'Φ' => 'F',
    'Δ' => 'D',
    'Π' => 'P',
    'Λ' => 'L',
    'Ρ' => 'R',
    'Ψ' => 'Ps',
    'Σ' => 'S',
    'Ω' => 'W'
}

Instance Method Summary collapse

#mb_charify(text) ⇒ Object

Returns a unicode compatible version of the string.
#normalize_name(value, options = {}) ⇒ Object

Normalizes whitespace and the spacing around punctuation: “a , a” => “a, a”, “a ,a” => “a, a”, “a,a” => “a, a”, “a/b” => “a / b”, “a/ b” => “a / b”, “a /b” => “a / b”. Removes trailing and leading [.,]. Options: :titleize => true (default: false).
#normalize_punctuation(str) ⇒ Object

Collapses spaces and commas. Fixes spacing around the following characters: “,.;:&”. Removes consecutive character dupes. Removes one leading and one trailing punctuation character (any of the characters above, not only commas).
#truncate(text, *args) ⇒ Object

Truncates the string. The result will be :length or shorter, and the words will not be cut in the middle. Arguments: :length => Integer (default: 30), :omission => String (default: “…”).
#urlify(string, opts = {}) ⇒ Object

Converts a string to a nicely readable URL. opts: :default_replacement – string to use for unknown characters (Default: “”); :whitespace_replacement – string to use to replace whitespace+ (Default: “-”).

Instance Method Details

#mb_charify(text) ⇒ `Object`

Returns a unicode compatible version of the string

support any of:

* ruby 1.9 sane utf8 support
* rails 2.1 workaround for crappy ruby 1.8 utf8 support
* rails 2.2 workaround for crappy ruby 1.8 utf8 support

hooray!

# File 'lib/string_utils.rb', line 166

# Returns a copy of +text+ that is safe to treat as a sequence of characters.
#
# * On Ruby 1.9 and later the result is simply +text.dup+.
# * Otherwise, if +text+ responds to +mb_chars+, the result is +text.mb_chars+
#   (taken from a duplicate when +text+ is frozen).
# * Otherwise a RuntimeError is raised.
def mb_charify(text)
  # Modern rubies: a plain duplicate is all that is needed.
  return text.dup if RUBY_VERSION >= '1.9'

  unless text.respond_to?(:mb_chars)
    raise "StringUtils: No unicode support for strings"
  end

  # Frozen strings are duplicated before being wrapped.
  source = text.frozen? ? text.dup : text
  source.mb_chars
end

#normalize_name(value, options = {}) ⇒ `Object`

Normalizes whitespace and the spacing around punctuation: “a , a” => “a, a”, “a ,a” => “a, a”, “a,a” => “a, a”, “a/b” => “a / b”, “a/ b” => “a / b”, “a /b” => “a / b”. Removes trailing and leading [.,]. Options: :titleize => true (default: false).

# File 'lib/string_utils.rb', line 73

# Normalizes whitespace and punctuation spacing in a name and returns the
# result as a String. The argument itself is not modified: all work happens on
# the copy returned by mb_charify.
#
#   "a , a" => "a, a"     "a ,a" => "a, a"     "a,a" => "a, a"
#   "a/b"   => "a / b"    "a/ b" => "a / b"    "a /b" => "a / b"
#
# value   - the String to normalize.
# options - Hash of options:
#           :titleize - when truthy, the result is passed through
#                       String#titleize (not part of core Ruby; presumably
#                       supplied by ActiveSupport — confirm) and the small
#                       words Of/And/A/An/The/To are lower-cased again when
#                       they stand between two whitespace characters.
#                       Default: not set.
def normalize_name(value, options = {})
  value = mb_charify(value)

  # Normalize whitespace: newlines first, then runs of whitespace/NBSP.
  value.gsub!("\n", ' ')
  value.gsub!(WHITESPACES, ' ')
  value.strip!

  # Remove one leading and one trailing "." or ",". No newline can remain at
  # this point, so anchoring on the whole string is equivalent to line anchors.
  value.sub!(/\A[.,]/, '')
  value.sub!(/[.,]\z/, '')

  # Remove quote pairs. Imperfect, but good enough
  value.gsub!(/\A['"]+(.*)['"]+\z/, '\1')

  # "a ,a"  => "a, a"
  # "a,a"   => "a, a"
  # "a , a" => "a, a"
  value.gsub!(/#{WHITESPACE_MATCHER}([,.])/, '\1')
  value.gsub!(/([,.])(#{NOT_WHITESPACE})/, '\1 \2')

  # "//" => "/"
  value.gsub!(/\/+/, '/')

  # "a/b" => "a / b", "a/ b" => "a / b", "a /b" => "a / b"
  value.gsub!(/(#{NOT_WHITESPACE})\//, '\1 /')
  value.gsub!(/\/(#{NOT_WHITESPACE})/, '/ \1')

  if options[:titleize]
    value = value.titleize
    # The trailing whitespace is only looked at, not consumed, so that it can
    # also serve as the leading whitespace of the next small word
    # ("Lord Of The Rings" => "Lord of the Rings").
    value.gsub!(/#{WHITESPACE_MATCHER}(?:Of|And|A|An|The|To)(?=#{WHITESPACE_MATCHER})/) { |m| m.downcase }
  end
  value.to_s
end

#normalize_punctuation(str) ⇒ `Object`

Collapses spaces and commas. Fixes spacing around the following characters:

,.;:&

Removes consecutive character dupes. Removes one leading and one trailing punctuation character (any of the characters above, not only commas).

# File 'lib/string_utils.rb', line 31

# Tidies the spacing around punctuation in +str+ and returns the result as a
# new String; +str+ itself is duplicated first and never modified.
#
# Steps, in order:
# 1. every run of whitespace becomes a single space;
# 2. whitespace around any of , . ; : & is removed;
# 3. a run of those characters is collapsed to its last character;
# 4. one leading and one trailing punctuation character is removed;
# 5. a space is inserted after , . ; : when followed by a non-space, and
#    around & when it sits between two non-space characters;
# 6. the result is stripped.
#
#   normalize_punctuation(" , a,,b ,") #=> "a, b"
#   normalize_punctuation("a&b&c")     #=> "a & b & c"
def normalize_punctuation(str)
  s = str.dup
  s.gsub!(/\s+/, ' ')

  # Collapse w/s around all
  s.gsub!(/\s*([:,&.;])\s*/, '\1')
  # Collapse consecutive dupes (the last character of the run is kept)
  s.gsub!(/([.,;&:])+/, '\1')

  # Collapse leading and trailing punctuation. Step 1 left no newlines, so
  # whole-string anchors are equivalent to line anchors here.
  s.gsub!(/\A\s*[,:&;.]|[.;&:,]\s*\z/, '')

  # Add whitespaces
  s.gsub!(/([,.;:])(\S)/, '\1 \2')
  # The character after "&" is only looked at, not consumed, so that it can
  # also act as the character before the next "&" ("a&b&c").
  s.gsub!(/(\S)&(?=\S)/, '\1 & ')

  s.strip!
  s
end

#truncate(text, *args) ⇒ `Object`

Truncates the string. The result will be :length or shorter, and the words will not be cut in the middle. Arguments: :length => Integer (default: 30), :omission => String (default: “…”).

# File 'lib/string_utils.rb', line 113

# Shortens +text+ to at most :length characters without cutting a word in the
# middle, appending :omission when something was removed.
#
# Two calling conventions are accepted:
#   truncate(text, :length => 30, :omission => "…")
#   truncate(text, length, omission)
# When positional values are given they are written into the options hash
# (which is the caller's own hash if one was passed as the last argument).
#
# Returns:
# * "" when +text+ is nil/false, or when :omission alone is longer than :length;
# * +text+ itself when it already fits;
# * otherwise a new String.
#
# Relies on rstrip_with_nbsp, which is defined outside this view — presumably
# it strips trailing whitespace including NBSP; confirm.
def truncate(text, *args)
  opts = args.last.is_a?(Hash) ? args.pop : {}

  # support either old or Rails 2.2 calling convention:
  unless args.empty?
    positional_length, positional_omission = args
    opts[:length]   = positional_length || 30
    opts[:omission] = positional_omission || "…"
  end

  opts          = {:length => 30, :omission => "…"}.merge(opts)
  opts[:length] = opts[:length].to_i
  limit         = opts[:length]

  return "" unless text

  # Fast paths: the text fits as is, or fits once trailing whitespace is gone.
  chars = mb_charify(text)
  return text if chars.length <= limit

  chars = rstrip_with_nbsp(chars)
  return chars.to_s if chars.length <= limit

  # From here on at least one word has to go:
  # take the first `budget` characters, drop a partially cut last word,
  # then append the omission.
  omission = mb_charify(opts[:omission])
  budget   = limit - omission.length
  return '' if budget < 0

  head = rstrip_with_nbsp(chars[0...budget] || "")

  # The cut fell exactly on a word boundary when the next character is
  # whitespace, or when stripping already shortened the head.
  on_boundary = chars[budget] =~ WHITESPACE || head.length < budget
  unless on_boundary
    last_word = head.split(WHITESPACES).last
    drop      = last_word ? last_word.length : 0
    head      = rstrip_with_nbsp(head[0...(head.length - drop)])
  end

  (head + opts[:omission]).to_s
end

#urlify(string, opts = {}) ⇒ `Object`

Converts a string to a nicely readable URL. opts: :default_replacement – string to use for unknown characters (Default: “”); :whitespace_replacement – string to use to replace whitespace+ (Default: “-”).

# File 'lib/string_utils.rb', line 57

# Builds a URL-friendly version of +string+ and returns it as a new String.
#
# opts:
#   :whitespace_replacement - replaces each run matched by WHITESPACES
#                             (default "-")
#   :default_replacement    - replaces every character that cannot be kept
#                             (default "")
#
# Processing order:
# 1. whitespace runs are replaced;
# 2. the result is stripped (this happens after step 1, so it only removes
#    leading/trailing whitespace that WHITESPACES did not match);
# 3. each non-ASCII character is looked up in TRANSLITERATIONS, falling back
#    to :default_replacement;
# 4. anything that is not a-z, 0-9, "-", "+" or "_" is replaced by
#    :default_replacement. Upper-case ASCII letters are not in that set, so
#    they are replaced as well.
def urlify(string, opts = {})
  settings = {:whitespace_replacement => '-', :default_replacement => ""}.merge(opts)
  fallback = settings[:default_replacement]

  string.
    gsub(WHITESPACES, settings[:whitespace_replacement]).
    strip.
    gsub(/[^\x00-\x7f]/u) { |ch| TRANSLITERATIONS[ch] || fallback }.
    gsub(/[^a-z0-9\-+_]/, fallback)
end

Module: StringUtils

Overview

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#mb_charify(text) ⇒ Object

#normalize_name(value, options = {}) ⇒ Object

#normalize_punctuation(str) ⇒ Object

#truncate(text, *args) ⇒ Object

#urlify(string, opts = {}) ⇒ Object

#mb_charify(text) ⇒ `Object`

#normalize_name(value, options = {}) ⇒ `Object`

#normalize_punctuation(str) ⇒ `Object`

#truncate(text, *args) ⇒ `Object`

#urlify(string, opts = {}) ⇒ `Object`