Class: TextUtils::Sanitizier
- Inherits:
-
Object
- Object
- TextUtils::Sanitizier
- Includes:
- LogUtils::Logging
- Defined in:
- lib/textutils/sanitizier.rb
Constant Summary collapse
@@ignore_tags = %w{ head script style }
@@inline_tags = %w{ span b i u }
@@block_tags = %w{ p div ul ol }
Instance Method Summary collapse
- #handle_block_tags(ht) ⇒ Object
- #handle_entities(ht) ⇒ Object
- #handle_ignore_tags(ht) ⇒ Object
- #handle_inline_tags(ht) ⇒ Object
-
#initialize(ht) ⇒ Sanitizier
constructor
A new instance of Sanitizier.
- #tag_regex(tag) ⇒ Object
- #to_plain_text ⇒ Object
Constructor Details
#initialize(ht) ⇒ Sanitizier
Returns a new instance of Sanitizier.
14 15 16 |
# File 'lib/textutils/sanitizier.rb', line 14
#
# Stores the markup that #to_plain_text reads when converting.
#
# @param ht [String] hypertext (html source)
def initialize( ht )
  @ht = ht   # hypertext (html source)
end
Instance Method Details
#handle_block_tags(ht) ⇒ Object
62 63 64 65 66 67 |
# File 'lib/textutils/sanitizier.rb', line 62
#
# Replaces every element named in @@block_tags with its inner content,
# putting a newline before and after that content.
#
# @param ht [String] hypertext (html source); not modified in place
# @return [String] the text with the block elements unwrapped
def handle_block_tags( ht )
  @@block_tags.inject( ht ) do |text, tag|
    # "\\1" is the first capture group of tag_regex (the element content);
    # a bare \1 inside double quotes is the octal escape for byte 0x01.
    text.gsub( tag_regex(tag), "\n\\1\n" )
  end
end
#handle_entities(ht) ⇒ Object
34 35 36 37 38 39 |
# File 'lib/textutils/sanitizier.rb', line 34
#
# Unescapes HTML entities by delegating to CGI.unescapeHTML.
#
# Decodes &amp; &lt; &gt; as well as numeric references such as &#65; and
# &#x42;.
# NOTE(review): other named entities (e.g. &nbsp;) are presumably left
# untouched by CGI.unescapeHTML - confirm against the CGI documentation.
#
# @param ht [String] text that may contain HTML entities
# @return [String] the text with entities replaced by their characters
def handle_entities( ht )
  CGI.unescapeHTML( ht )
end
#handle_ignore_tags(ht) ⇒ Object
47 48 49 50 51 52 |
# File 'lib/textutils/sanitizier.rb', line 47
#
# Removes every element named in @@ignore_tags together with its content.
#
# @param ht [String] hypertext (html source); not modified in place
# @return [String] the text without the ignored elements
def handle_ignore_tags( ht )
  @@ignore_tags.inject( ht ) do |text, tag|
    text.gsub( tag_regex(tag), '' )
  end
end
#handle_inline_tags(ht) ⇒ Object
54 55 56 57 58 59 60 |
# File 'lib/textutils/sanitizier.rb', line 54
#
# Replaces every element named in @@inline_tags with its inner content
# followed by a single space.
#
# @param ht [String] hypertext (html source); not modified in place
# @return [String] the text with the inline elements unwrapped
def handle_inline_tags( ht )
  @@inline_tags.inject( ht ) do |text, tag|
    # add a space after; '\1' (single-quoted) is the captured element content
    text.gsub( tag_regex(tag), '\1 ' )
  end
end
#tag_regex(tag) ⇒ Object
41 42 43 44 45 |
# File 'lib/textutils/sanitizier.rb', line 41
#
# Builds a regex that matches one complete element (opening tag, content,
# closing tag) for the given tag name. Capture group 1 is the content.
# Matching is case-insensitive and the content may span several lines.
#
# @param tag [String] tag name, e.g. "p"
# @return [Regexp]
def tag_regex( tag )
  # The tag name must be followed directly by ">" or by whitespace plus
  # attributes, so that "b" does not match <br>/<body> and "p" does not
  # match <pre>.
  # note use non-greedy .*? for content
  /<#{tag}(?:\s[^>]*)?>(.*?)<\/#{tag}>/mi
end
#to_plain_text ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/textutils/sanitizier.rb', line 18
#
# Converts the hypertext given to the constructor into plain text by
# running it through the tag handlers and then unescaping entities.
#
# NOTE(review): the handler names in the three tag steps below were lost
# in the extracted listing (each read "ht = ( ht )"). They are
# reconstructed from the handlers documented for this class, in the order
# those handlers are defined in the file - confirm against the original
# source.
#
# @return [String] the result of handle_entities
def to_plain_text
  ht = @ht
  ht = handle_ignore_tags( ht )

  ## handle_pre_tags ??  - special rule for preformatted (keep whitespace)

  ht = handle_inline_tags( ht )
  ht = handle_block_tags( ht )

  # NOTE(review): the listing has a fourth step here, commented
  # "rules for remain/left over tags". Its method name was lost and no
  # matching handler is documented for this class, so it is left out until
  # the intended call is confirmed.

  ht = handle_entities( ht )

  ht
end