Class: TextUtils::Sanitizier

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/textutils/sanitizier.rb

Constant Summary collapse

@@ignore_tags =
%w{ head script style }
@@inline_tags =
%w{ span b i u }
@@block_tags =
%w{ p div ul ol }

Instance Method Summary collapse

Constructor Details

#initialize(ht) ⇒ Sanitizier

Returns a new instance of Sanitizier.



14
15
16
# File 'lib/textutils/sanitizier.rb', line 14

# Creates a sanitizer for the given markup.
#
# The argument is stored as-is (no copy is made here) and is read later by
# to_plain_text.
#
# @param ht [Object] hypertext (html source); presumably a String - confirm with callers
def initialize(ht)
  # hypertext (html source)
  @ht = ht
end

Instance Method Details

#handle_block_tags(ht) ⇒ Object



62
63
64
65
66
67
# File 'lib/textutils/sanitizier.rb', line 62

# Replaces every element named in @@block_tags with its inner content
# surrounded by newlines, so that block content ends up on its own line.
#
# The string is rewritten in place (gsub!) and also returned.
#
# @param ht [String] markup to rewrite (mutated)
# @return [String] the same string object, after rewriting
def handle_block_tags( ht )
  @@block_tags.each do |tag|
    # The backslash has to be doubled inside a double-quoted string: a bare
    # "\1" is the octal escape for the control character U+0001, not a
    # back-reference to the first capture group of tag_regex.
    ht.gsub!( tag_regex(tag), "\n\\1\n" )
  end
  ht
end

#handle_entities(ht) ⇒ Object



34
35
36
37
38
39
# File 'lib/textutils/sanitizier.rb', line 34

# Unescapes HTML entities in the given text by delegating to
# CGI.unescapeHTML and returns that result.
#
# NOTE(review): the original author left an open question here - whether
# this also decodes generic named entities or only the basic ones such as
# > and <. Confirm against the CGI documentation before relying on it.
#
# @param ht [String] text that may contain HTML entities
# @return [String] result of CGI.unescapeHTML
def handle_entities(ht)
  CGI.unescapeHTML(ht)
end

#handle_ignore_tags(ht) ⇒ Object



47
48
49
50
51
52
# File 'lib/textutils/sanitizier.rb', line 47

# Removes every element named in @@ignore_tags, including everything
# between its opening and closing tag.
#
# The string is rewritten in place (gsub!) and the same object is returned.
#
# @param ht [String] markup to rewrite (mutated)
# @return [String] the same string object, after rewriting
def handle_ignore_tags(ht)
  # each_with_object hands back its memo object, i.e. the string passed in
  @@ignore_tags.each_with_object(ht) do |tag, text|
    text.gsub!(tag_regex(tag), '')
  end
end

#handle_inline_tags(ht) ⇒ Object



54
55
56
57
58
59
60
# File 'lib/textutils/sanitizier.rb', line 54

# Replaces every element named in @@inline_tags with its inner content
# followed by a single space.
#
# The string is rewritten in place (gsub!) and the same object is returned.
#
# @param ht [String] markup to rewrite (mutated)
# @return [String] the same string object, after rewriting
def handle_inline_tags(ht)
  @@inline_tags.each_with_object(ht) do |tag, text|
    pattern = tag_regex(tag)
    # '\1 ' keeps the captured content and adds a space after it
    text.gsub!(pattern, '\1 ')
  end
end

#tag_regex(tag) ⇒ Object



41
42
43
44
45
# File 'lib/textutils/sanitizier.rb', line 41

# Builds a case-insensitive, multi-line regex that matches one complete
# element (opening tag with optional attributes, content, closing tag) and
# captures the content as group 1.
#
# @param tag [String, Symbol] element name, e.g. "p"
# @return [Regexp] pattern with the element content in capture group 1
def tag_regex( tag )
  # escape the name so it is always matched literally
  name = Regexp.escape( tag.to_s )

  # The lookahead requires the tag name to end right after `name`
  # (whitespace, "/" or ">" follows). Without it "b" would also match
  # <br> or <body>, "p" would match <pre>, "u" would match <ul>, and so on.
  #
  # note use non-greedy .*? for content
  /<#{name}(?=[\s\/>])[^>]*>(.*?)<\/#{name}\s*>/mi
end

#to_plain_text ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/textutils/sanitizier.rb', line 18

# Converts the stored hypertext into plain text by running it through the
# handle_* steps in a fixed order: ignored tags, inline tags, block tags,
# other tags, and finally entity unescaping.
#
# NOTE(review): handle_other_tags is not visible in this part of the file;
# it is assumed to take and return the text like the other steps - confirm.
#
# @return [String] result of the final handle_entities step
def to_plain_text
  # Work on a copy: the handle_* steps rewrite their argument in place
  # (gsub!), which would otherwise change the string handed to the
  # constructor, make a second call start from already-stripped text, and
  # raise on a frozen input.
  ht = @ht.dup
  ht = handle_ignore_tags( ht )

  ## handle_pre_tags ??  - special rule for preformatted (keep whitespace)

  ht = handle_inline_tags( ht )
  ht = handle_block_tags( ht )
  ht = handle_other_tags( ht )  # rules for remain/left over tags

  handle_entities( ht )
end