Module: Sinew::TextUtil

Extended by:
TextUtil
Included in:
TextUtil
Defined in:
lib/sinew/text_util.rb

Constant Summary collapse

# Tag names that html_clean_from_tidy passes through untouched; every other
# tag is rewritten without its attributes.
ATTRS_KEEP = Set.new(%w[a img iframe])
# Command-line options handed to the external tidy program by html_tidy.
# Each pair is rendered as "key value"; a nil value therefore yields a bare
# switch. Insertion order is the order the options appear on the command line.
TIDY_OPTIONS = {
  # bare switches
  "-asxml"          => nil,
  "-bare"           => nil,
  "-quiet"          => nil,
  "-utf8"           => nil,
  # options that carry a value
  "-wrap"           => 0,
  "--doctype"       => "omit",
  "--hide-comments" => "yes",
  "--force-output"  => "yes",
  "-f"              => "/dev/null"
}
# The five XML special characters mapped to their entity references.
# Frozen so the shared lookup table cannot be modified by accident.
XML_ENTITIES = {
  "&" => "&amp;",
  "<" => "&lt;",
  ">" => "&gt;",
  "'" => "&apos;",
  '"' => "&quot;"
}.freeze

# Reverse lookup (entity reference => character), derived from XML_ENTITIES.
XML_ENTITIES_INV = XML_ENTITIES.invert.freeze
# Entity => plain-text replacement table used by unent. Starts from the XML
# entities in XML_ENTITIES_INV and adds common named and numeric entities,
# each mapped to a plain-text stand-in.
# Frozen so the shared lookup table cannot be modified by accident.
COMMON_ENTITIES_INV =
XML_ENTITIES_INV.merge(
  # named entities
  "&frac12;" => "1/2",
  "&frac14;" => "1/4",
  "&frac34;" => "3/4",
  "&ldquo;" => '"',
  "&lsquo;" => "'",
  "&mdash;" => "-",
  "&nbsp;" => " ",
  "&ndash;" => "-",
  "&rdquo;" => '"',
  "&rsquo;" => "'",
  "&tilde;" => "~",
  # numeric character references
  "&#34;" => '"',
  "&#39;" => "'",
  "&#160;" => " ",
  "&#8232;" => "\n"
).freeze

Instance Method Summary collapse

Instance Method Details

#html_clean(s) ⇒ Object



68
69
70
# File 'lib/sinew/text_util.rb', line 68

# Full cleaning pipeline: runs +s+ through html_tidy, then hands the tidied
# markup to html_clean_from_tidy.
#
# @param s [String] raw HTML
# @return [Object] whatever html_clean_from_tidy returns for the tidied markup
def html_clean(s)
  tidied = html_tidy(s)
  html_clean_from_tidy(tidied)
end

#html_clean_from_tidy(s) ⇒ Object



72
73
74
75
76
77
78
79
# File 'lib/sinew/text_util.rb', line 72

# Strips the attributes from every tag except those named in ATTRS_KEEP.
# Works on a copy, so the argument itself is never modified.
#
# @param s [String] markup (as produced by html_tidy)
# @return [String] a new string with most attributes removed
def html_clean_from_tidy(s)
  cleaned = s.dup
  cleaned.gsub!(/<([^\s>]+)[^>]*?(\/)?>/) do |tag|
    m = Regexp.last_match
    # m[1] is the tag name (including a leading "/" on closing tags),
    # m[2] is the self-closing slash when one is present.
    next tag if ATTRS_KEEP.include?(m[1])

    "<#{m[1]}#{m[2]}>"
  end
  cleaned
end

#html_tidy(s) ⇒ Object

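Section: tidy/clean. Pipes the input through the external tidy program, then strips unwanted tags and collapses whitespace.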



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/sinew/text_util.rb', line 45

# Pipes +s+ through the external +tidy+ command (flags built from
# TIDY_OPTIONS) and then strips markup this module does not keep.
#
# @param s [String] raw HTML, written to tidy's stdin
# @return [String] tidy's output with the <html> tag's attributes removed,
#   meta/link/style/script tags and <?...?> declarations deleted, and
#   whitespace collapsed
# @raise [RuntimeError] if tidy exits with a status greater than 2
def html_tidy(s)
  # run tidy; each option renders as "key value", so nil values give a bare flag
  args = TIDY_OPTIONS.map { |k, v| "#{k} #{v}" }.join(" ")
  s = IO.popen("tidy #{args}", "rb+") do |f|
    f.write(s)
    f.close_write # close our write end so tidy sees EOF on its stdin
    f.read
  end
  # Read the exit code via exitstatus instead of the deprecated
  # Process::Status#>>. A nil exitstatus (no normal exit) counts as 0, which
  # matches what `$? >> 8` produced. Statuses 0..2 are accepted.
  raise "could not run tidy" if $?.exitstatus.to_i > 2

  # now kill some tags
  s.sub!(/<html\b[^>]+>/, "<html>")
  s.gsub!(/<\/?(meta|link)\b[^>]*>/m, "")
  s.gsub!(/<(style|script)\b[^>]*(\/>|>.*?<\/\1\b>)/m, "")
  s.gsub!(/<\?[^>]*>/m, "")
  # NOTE(review): squish! is not core Ruby; presumably ActiveSupport's
  # String#squish! (collapse whitespace runs, strip ends). Confirm it is loaded.
  s.squish!

  # kill whitespace around tags
  s.gsub!(/ ?<([^>]+)> ?/, "<\\1>")

  s
end

#unent(s) ⇒ Object



97
98
99
# File 'lib/sinew/text_util.rb', line 97

# Replaces HTML/XML entities in +s+ with their plain-text equivalents from
# COMMON_ENTITIES_INV. Entities that match the pattern but are not in the
# table are removed (replaced by the empty string). The pattern only matches
# lowercase letters and digits, so entities containing uppercase letters are
# left in place.
#
# @param s [String]
# @return [String] a new string with entities decoded or dropped
def unent(s)
  entity = /&#?[a-z0-9]{2,};/
  s.gsub(entity) do |found|
    # unknown entities look up as nil, i.e. they are replaced by ""
    COMMON_ENTITIES_INV[found].to_s
  end
end

#untag(s) ⇒ Object



93
94
95
# File 'lib/sinew/text_util.rb', line 93

# Replaces every tag-like span (+<+, one or more non-+>+ characters, +>+)
# with a single space.
#
# @param s [String]
# @return [String] a new string with tags blanked out
def untag(s)
  tag = /<[^>]+>/
  s.gsub(tag) { " " }
end

#xml_escape(s) ⇒ Object

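Section: untag/unent. This heading is a section comment picked up from the source; xml_escape itself replaces &, <, >, ' and " with their XML entities.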



85
86
87
# File 'lib/sinew/text_util.rb', line 85

# Replaces each of the characters & < > ' " in +s+ with the entity that
# XML_ENTITIES maps it to.
#
# @param s [String]
# @return [String] a new, escaped string
def xml_escape(s)
  special = /[&<>'"]/
  s.gsub(special, XML_ENTITIES)
end

#xml_unescape(s) ⇒ Object



89
90
91
# File 'lib/sinew/text_util.rb', line 89

# Decodes the five XML entities (&amp; &lt; &gt; &apos; &quot;) in +s+ back
# to the characters XML_ENTITIES_INV maps them to. Other entities are left
# as they are.
#
# @param s [String]
# @return [String] a new, unescaped string
def xml_unescape(s)
  xml_entity = /&(amp|lt|gt|apos|quot);/
  s.gsub(xml_entity, XML_ENTITIES_INV)
end