Module: Sinew::TextUtil
Constant Summary collapse
- ATTRS_KEEP =
Set.new %w(a img iframe)
- TIDY_OPTIONS =
{ "-asxml" => nil, "-bare" => nil, "-quiet" => nil, "-utf8" => nil, "-wrap" => 0, "--doctype" => "omit", "--hide-comments" => "yes", "--force-output" => "yes", "-f" => "/dev/null", }
- XML_ENTITIES =
{ "&"=>"&amp;", "<"=>"&lt;", ">"=>"&gt;", "'"=>"&apos;", '"'=>"&quot;" }
- XML_ENTITIES_INV =
XML_ENTITIES.invert
- COMMON_ENTITIES_INV =
XML_ENTITIES_INV.merge( "&frac12;" => "1/2", "&frac14;" => "1/4", "&frac34;" => "3/4", "&ldquo;" => '"', "&lsquo;" => "'", "&mdash;" => "-", "&nbsp;" => " ", "&ndash;" => "-", "&rdquo;" => '"', "&rsquo;" => "'", "&tilde;" => "~", "&#34;" => '"', "&#39;" => "'", "&#160;" => " ", "&#8232;" => "\n" )
Instance Method Summary collapse
- #html_clean(s) ⇒ Object
- #html_clean_from_tidy(s) ⇒ Object
-
#html_tidy(s) ⇒ Object
tidy/clean.
- #unent(s) ⇒ Object
- #untag(s) ⇒ Object
-
#xml_escape(s) ⇒ Object
untag/unent.
- #xml_unescape(s) ⇒ Object
Instance Method Details
#html_clean(s) ⇒ Object
68 69 70 |
# File 'lib/sinew/text_util.rb', line 68
# Cleans an HTML string in two passes: html_tidy first, then
# html_clean_from_tidy on the tidied result. Returns the cleaned string.
def html_clean(s)
  tidied = html_tidy(s)
  html_clean_from_tidy(tidied)
end
#html_clean_from_tidy(s) ⇒ Object
72 73 74 75 76 77 78 79 |
# File 'lib/sinew/text_util.rb', line 72
# Returns a copy of s in which every tag whose name is not listed in
# ATTRS_KEEP has its attributes removed. Tags named in ATTRS_KEEP are left
# untouched. The argument itself is not modified.
def html_clean_from_tidy(s)
  # then kill most attrs
  s.gsub(/<([^\s>]+)[^>]*?(\/)?>/) do |tag|
    # name is the tag name (with a leading "/" for closing tags); slash is
    # the optional self-closing "/" and is nil when absent.
    name, slash = $~.captures
    ATTRS_KEEP.include?(name) ? tag : "<#{name}#{slash}>"
  end
end
#html_tidy(s) ⇒ Object
tidy/clean
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/sinew/text_util.rb', line 45
# tidy/clean
#
# Pipes s through the external `tidy` command (flags taken from
# TIDY_OPTIONS) and then slims down the output: the <html> tag loses its
# attributes; meta/link tags, style/script elements and <?...> processing
# instructions are removed; whitespace is squished and the single spaces
# around tags are dropped.
#
# Returns the tidied string.
# Raises RuntimeError ("could not run tidy") when tidy exits with a status
# greater than 2 or does not exit normally.
def html_tidy(s)
  # run tidy
  args = TIDY_OPTIONS.map { |k, v| "#{k} #{v}" }.join(" ")
  s = IO.popen("tidy #{args}", "rb+") do |f|
    f.write(s)
    f.close_write
    f.read
  end

  # Ask for the exit code directly rather than shifting the raw status
  # (Process::Status#>> is deprecated). exitstatus is nil when the process
  # was terminated by a signal; that is treated as a failure too, instead
  # of being mistaken for exit code 0.
  exit_code = $?.exitstatus
  raise "could not run tidy" if exit_code.nil? || exit_code > 2

  # now kill some tags
  s.sub!(/<html\b[^>]+>/, "<html>")
  s.gsub!(/<\/?(meta|link)\b[^>]*>/m, "")
  s.gsub!(/<(style|script)\b[^>]*(\/>|>.*?<\/\1\b>)/m, "")
  s.gsub!(/<\?[^>]*>/m, "")
  # NOTE(review): squish! is not core Ruby — presumably provided by
  # ActiveSupport, loaded elsewhere in the project.
  s.squish!

  # kill whitespace around tags
  s.gsub!(/ ?<([^>]+)> ?/, "<\\1>")

  s
end
#unent(s) ⇒ Object
97 98 99 |
# File 'lib/sinew/text_util.rb', line 97
# Returns a copy of s with every lowercase/numeric entity (text matching
# &name; or &#digits;) replaced by its COMMON_ENTITIES_INV value.
# Entities that have no entry in the table are replaced with an empty
# string, i.e. removed.
def unent(s)
  # With a Hash as the replacement, gsub looks each match up in the hash and
  # substitutes an empty string when the key is missing.
  s.gsub(/&#?[a-z0-9]{2,};/, COMMON_ENTITIES_INV)
end
#untag(s) ⇒ Object
93 94 95 |
# File 'lib/sinew/text_util.rb', line 93
# Returns a copy of s in which every tag-like run (a "<", one or more
# characters other than ">", then ">") is replaced by a single space.
def untag(s)
  tag = /<[^>]+>/
  s.gsub(tag) { " " }
end
#xml_escape(s) ⇒ Object
untag/unent
85 86 87 |
# File 'lib/sinew/text_util.rb', line 85
# Returns a copy of s with each of the characters & < > ' " replaced by the
# value XML_ENTITIES maps it to.
def xml_escape(s)
  # Hash replacement: each matched character is looked up in XML_ENTITIES.
  s.gsub(/[&<>'"]/, XML_ENTITIES)
end
#xml_unescape(s) ⇒ Object
89 90 91 |
# File 'lib/sinew/text_util.rb', line 89
# Returns a copy of s with each of the five entities &amp; &lt; &gt; &apos;
# &quot; replaced by the value XML_ENTITIES_INV maps it to.
def xml_unescape(s)
  # Hash replacement: each matched entity is looked up in XML_ENTITIES_INV.
  s.gsub(/&(amp|lt|gt|apos|quot);/, XML_ENTITIES_INV)
end