Module: String::Cleaner

Included in:
String
Defined in:
lib/string_cleaner.rb

Constant Summary collapse

# Unicode space-like characters, each stored as a one-character UTF-8 string.
# fix_invisible_chars joins these into a regexp alternation and replaces every
# match with a plain space. Frozen so the shared list cannot be mutated.
SPECIAL_SPACES =
[
  0x00A0,                # NO-BREAK SPACE
  0x1680,                # OGHAM SPACE MARK
  0x180E,                # MONGOLIAN VOWEL SEPARATOR
  *(0x2000..0x200A),     # EN QUAD..HAIR SPACE
  0x2028,                # LINE SEPARATOR
  0x2029,                # PARAGRAPH SEPARATOR
  0x202F,                # NARROW NO-BREAK SPACE
  0x205F,                # MEDIUM MATHEMATICAL SPACE
  0x3000,                # IDEOGRAPHIC SPACE
].map { |codepoint| [codepoint].pack("U").freeze }.freeze
# Zero-width characters, each stored as a one-character UTF-8 string.
# fix_invisible_chars joins these into a regexp alternation and deletes every
# match. Frozen so the shared list cannot be mutated.
ZERO_WIDTH =
[
  0x200B,                # ZERO WIDTH SPACE
  0x200C,                # ZERO WIDTH NON-JOINER
  0x200D,                # ZERO WIDTH JOINER
  0x2060,                # WORD JOINER
  0xFEFF,                # ZERO WIDTH NO-BREAK SPACE
].map { |codepoint| [codepoint].pack("U").freeze }.freeze

Instance Method Summary collapse

Instance Method Details

#chartable(options = {}) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/string_cleaner.rb', line 89

# Builds the character substitution table that to_permalink passes to
# to_ascii: each key is a single character, each value its ASCII replacement.
#
# Keys outside Latin-1, plus the invisible soft hyphen and the pilcrow, are
# written as \u escapes so they cannot be silently lost or confused with an
# empty string when the file is re-encoded.
#
# @param options [Hash] overrides for the defaults below
# @option options [Boolean] :clean_binary (true) also map ASCII control
#   characters (0-31 except line feed and carriage return, plus 127) to ""
#   (tab maps to a space)
# @option options [Boolean] :translit_symbols (true) also spell out symbols
#   such as "$", "%", "&", "@" and currency signs as words
# @return [Hash{String => String}] a new table on every call
def chartable(options = {})
  options = {
    :clean_binary => true,
    :translit_symbols => true,
  }.merge(options)
  char = "%c"
  table = {
    "`" => "'",       # dec = 96
    "¦" => "|",       # dec = 166, broken vertical bar
    "¨" => "",        # dec = 168, spacing diaeresis - umlaut
    "ª" => "",        # dec = 170, feminine ordinal indicator
    "«" => "\"",      # dec = 171, left double angle quotes
    "¬" => "!",       # dec = 172, not sign
    "\u00AD" => "-",  # dec = 173, soft hyphen
    "¯" => "-",       # dec = 175, spacing macron - overline
    "²" => "2",       # dec = 178, superscript two - squared
    "³" => "3",       # dec = 179, superscript three - cubed
    "´" => "'",       # dec = 180, acute accent - spacing acute
    "·" => "",        # dec = 183, middle dot - Georgian comma
    "¸" => "",        # dec = 184, spacing cedilla
    "¹" => "1",       # dec = 185, superscript one
    "º" => "0",       # dec = 186, masculine ordinal indicator
    "»" => "\"",      # dec = 187, right double angle quotes
    "¿" => "",        # dec = 191, inverted question mark
    "Ý" => "Y",       # dec = 221
    "\u2013" => "-",  # hex = 2013, en dash
    "\u2014" => "-",  # hex = 2014, em dash
    "\u201A" => "'",  # hex = 201A, single low-9 quotation mark
    "\u201E" => "\"", # hex = 201E, double low-9 quotation mark
  }
  if options[:clean_binary]
    # Control characters 0-31 and 127 (delete) are dropped, except that tab
    # (9) becomes a space; line feed (10) and carriage return (13) are
    # deliberately left out of the table.
    [*0..8, 9, 11, 12, *14..31, 127].each do |code|
      table[char % code] = (code == 9 ? " " : "")
    end
  end
  if options[:translit_symbols]
    table["$"]        = " dollars "              # dec = 36, dollar sign
    table["%"]        = " percent "              # dec = 37, percent sign
    table["&"]        = " and "                  # dec = 38, ampersand
    table["@"]        = " at "                   # dec = 64, at symbol
    table[char % 128] = " euros "                # windows euro
    table["¢"]        = " cents "                # dec = 162, cent sign
    table["£"]        = " pounds "               # dec = 163, pound sign
    table["¤"]        = " euros "                # dec = 164, currency sign
    table["¥"]        = " yens "                 # dec = 165, yen sign
    table["§"]        = " section "              # dec = 167, section sign
    table["©"]        = " copyright "            # dec = 169, copyright sign
    table["®"]        = " registered trademark " # dec = 174, registered trade mark sign
    table["°"]        = " degrees "              # dec = 176, degree sign
    table["±"]        = " approx "               # dec = 177, plus-or-minus sign
    table["µ"]        = " micro "                # dec = 181, micro sign
    table["\u00B6"]   = " paragraph "            # dec = 182, pilcrow sign - paragraph sign
    table["¼"]        = " 1/4 "                  # dec = 188, fraction one quarter
    table["½"]        = " 1/2 "                  # dec = 189, fraction one half
    table["¾"]        = " 3/4 "                  # dec = 190, fraction three quarters
    table["\u20AC"]   = " euros "                # hex = 20AC, unicode euro
    table["\u2122"]   = " trademark "            # hex = 2122, trade mark
  end
  table
end

#clean ⇒ Object



6
7
8
# File 'lib/string_cleaner.rb', line 6

# Runs the full cleanup pipeline on the receiver: encoding repair first,
# then end-of-line normalisation, then invisible-character cleanup.
# Returns the result of the last step.
def clean
  reencoded = fix_encoding
  normalized = reencoded.fix_endlines
  normalized.fix_invisible_chars
end

#fix_encoding ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/string_cleaner.rb', line 10

# Returns a copy of the receiver as UTF-8 text.
#
# Bytes that are not valid UTF-8 are reinterpreted as ISO-8859-1 and
# transcoded. The Windows-1252 euro byte (0x80, which becomes U+0080 after
# that transcoding) and the currency sign "¤" are both rewritten to the euro
# sign "€".
#
# @return [String] a new string; the receiver is not modified
def fix_encoding
  utf8 = dup
  if utf8.respond_to?(:force_encoding)
    utf8.force_encoding("UTF-8") # for Ruby 1.9+
    unless utf8.valid_encoding? # if invalid UTF-8
      utf8 = utf8.force_encoding("ISO8859-1")
      utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
    end
    # gsub! returns nil when nothing matched, so return utf8 explicitly.
    utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
    utf8
  else
    # Legacy path for Rubies without String#force_encoding.
    require "iconv"
    utf8 << " "
    begin
      Iconv.new("UTF-8", "UTF-8").iconv(utf8)
    rescue
      # Map the Windows-1252 euro byte onto the currency-sign byte before
      # transcoding, then turn the currency sign into a real euro sign.
      utf8.gsub!(/\x80/n, "\xA4")
      Iconv.new("UTF-8//IGNORE", "ISO8859-1").iconv(utf8).gsub("¤", "€")
    end
  end
end

#fix_endlines ⇒ Object



32
33
34
# File 'lib/string_cleaner.rb', line 32

# Returns a copy of the receiver in which every CRLF pair and every lone
# carriage return has been replaced by a single line feed.
def fix_endlines
  foreign_endline = /(?:\r\n|\r)/u
  gsub(foreign_endline, "\n")
end

#fix_invisible_chars ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/string_cleaner.rb', line 56

# Returns a copy of the receiver with invisible characters cleaned up:
# characters listed in ZERO_WIDTH are deleted, then, line by line, whitespace
# and Unicode "Other" (\p{C}) characters become plain spaces, and finally
# every character listed in SPECIAL_SPACES becomes a plain space.
# The text is split on "\n" and re-joined, so line feeds are preserved.
def fix_invisible_chars
  cleaned = dup
  zero_width_pattern = Regexp.new(ZERO_WIDTH.join("|"))
  cleaned.gsub!(zero_width_pattern, "")
  cleaned =
    if cleaned.respond_to?(:force_encoding)
      # A sentinel space is appended so split keeps trailing empty lines;
      # chop! removes it again after the lines are re-joined.
      padded = cleaned << " "
      scrubbed_lines = padded.split(/\n/u).map do |line|
        line.gsub(/[\s\p{C}]/u, " ")
      end
      scrubbed_lines.join("\n").chop!
    else
      # Legacy path for Rubies without String#force_encoding.
      require "oniguruma"
      scrubbed_lines = cleaned.split(/\n/n).collect do |line|
        Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
      end
      scrubbed_lines.join("\n").chop!
    end
  # NOTE: inside a double-quoted string "\s" is a literal space, so the last
  # alternative matches a plain space only, not the regexp \s class.
  special_space_pattern = Regexp.new(SPECIAL_SPACES.join("|") + "|\s")
  cleaned.gsub!(special_space_pattern, " ")
  cleaned
end

#nl2br ⇒ Object



81
82
83
# File 'lib/string_cleaner.rb', line 81

# Returns a copy of the receiver with an HTML line break tag inserted in
# front of every line feed (the line feed itself is kept).
def nl2br
  html_break = "<br/>\n"
  gsub("\n") { html_break }
end

#to_nicer_sym ⇒ Object



85
86
87
# File 'lib/string_cleaner.rb', line 85

# Converts the receiver to a Symbol by building its permalink form with "_"
# as the separator and calling to_sym on the result.
def to_nicer_sym
  underscored = to_permalink("_")
  underscored.to_sym
end


77
78
79
# File 'lib/string_cleaner.rb', line 77

# Builds a permalink-style slug from the receiver: the text is cleaned,
# passed through to_ascii with the chartable substitutions, lower-cased, every
# run of characters other than a-z and 0-9 is collapsed into +separator+, and
# leading/trailing separators are trimmed.
def to_permalink(separator = "-")
  ascii_text = clean.to_ascii(chartable)
  slug = ascii_text.downcase.gsub(/[^a-z0-9]+/, separator)
  slug.trim(separator)
end

#trim(chars = "") ⇒ Object



73
74
75
# File 'lib/string_cleaner.rb', line 73

# Removes leading and trailing characters from the receiver.
#
# @param chars [String] the set of characters to strip from both ends; each
#   character is taken literally (regexp metacharacters such as "^", "]",
#   "-" or "\\" are escaped). When empty, behaves like String#strip.
# @return [String] a new string; the receiver is not modified
def trim(chars = "")
  return strip if chars.empty?

  # Escape before interpolating into the character class so that characters
  # with special meaning inside [...] cannot break or alter the pattern.
  char_class = "[#{Regexp.escape(chars)}]+"
  gsub(/\A#{char_class}|#{char_class}\z/, "")
end