Class: HTML2TextParser

Inherits:
SGMLParser show all
Defined in:
lib/feed2imap/html2text-parser.rb

Overview

This class provides a simple SGML parser that removes HTML tags.

Constant Summary

Constants inherited from SGMLParser

SGMLParser::Attrfind, SGMLParser::Charref, SGMLParser::Commentclose, SGMLParser::Commentopen, SGMLParser::Endbracket, SGMLParser::Endtagopen, SGMLParser::Entitydefs, SGMLParser::Entityref, SGMLParser::Incomplete, SGMLParser::Interesting, SGMLParser::Special, SGMLParser::Starttagopen, SGMLParser::Tagfind

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from SGMLParser

#feed, #finish_endtag, #finish_starttag, #goahead, #handle_charref, #handle_comment, #handle_endtag, #handle_entityref, #handle_special, #handle_starttag, #has_context, #parse_comment, #parse_endtag, #parse_special, #parse_starttag, #report_unbalanced, #reset, #setliteral, #setnomoretags, #unknown_charref, #unknown_entityref

Constructor Details

#initialize(verbose = false) ⇒ HTML2TextParser

Returns a new instance of HTML2TextParser.



27
28
29
30
31
32
33
# File 'lib/feed2imap/html2text-parser.rb', line 27

# Builds a parser with an empty output buffer and no recorded links.
#
# State set up here:
# * @links    - link targets collected from <a href=...> tags, in order
# * @href     - href of the anchor currently open, or nil
# * @pre      - true while inside a <pre> element
# * @savedata - the plain-text output accumulated so far
#
# @param verbose [Boolean] forwarded unchanged to the superclass constructor
def initialize(verbose = false)
  @links = []
  @href = nil
  @pre = false
  @savedata = ''
  super(verbose)
end

Instance Attribute Details

#savedata ⇒ Object (readonly)

Returns the value of attribute savedata.



25
26
27
# File 'lib/feed2imap/html2text-parser.rb', line 25

# Read-only accessor for the plain-text output accumulated so far.
#
# @return [String] the buffer stored in @savedata
def savedata
  @savedata
end

Instance Method Details

#close ⇒ Object



71
72
73
74
75
76
77
78
79
# File 'lib/feed2imap/html2text-parser.rb', line 71

# Finishes parsing and, when any links were collected, appends a numbered
# reference list to the output: a blank line followed by one
# "[n] target" line per link, numbered from 1 in collection order.
#
# @return [Array<String>, nil] the link list when it is non-empty, else nil
def close
  super
  return if @links.empty?

  @savedata << "\n\n"
  @links.each_with_index do |target, position|
    @savedata << "[#{position + 1}] #{target}\n"
  end
end

#handle_data(data) ⇒ Object



35
36
37
38
39
40
# File 'lib/feed2imap/html2text-parser.rb', line 35

# Appends a chunk of character data to the text accumulated in @savedata.
#
# Outside a <pre> element every line feed ("\n") is dropped; inside one the
# chunk is appended untouched. The stripping is done on a copy, so the
# caller's string is never modified and frozen strings are accepted.
#
# NOTE(review): line feeds are removed rather than replaced by a space, so
# words separated only by a line break end up joined; kept as-is for
# backward compatibility.
#
# @param data [String] raw text found between tags
# @return [String] the accumulated output buffer
def handle_data(data)
  text = @pre ? data : data.delete("\n")
  @savedata << text
end

#unknown_endtag(tag) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/feed2imap/html2text-parser.rb', line 81

# Emits the plain-text counterpart of a closing tag.
#
# * </b>, </u>, </i> close the emphasis with "*", "_" and "/" respectively
# * </pre> emits a blank line and leaves preformatted mode
# * </a> emits "[n]", n being the number of links collected so far, but
#   only when the anchor being closed carried an href
#
# Any other tag is ignored.
#
# @param tag [String] name of the tag being closed
def unknown_endtag(tag)
  closing_marker = { 'b' => '*', 'u' => '_', 'i' => '/' }[tag]

  if closing_marker
    @savedata << closing_marker
  elsif tag == 'pre'
    @savedata << "\n\n"
    @pre = false
  elsif tag == 'a' && @href
    @savedata << "[#{@links.length}]"
    @href = nil
  end
end

#unknown_starttag(tag, attrs) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/feed2imap/html2text-parser.rb', line 42

# Emits the plain-text counterpart of an opening tag.
#
# * <p> and <pre> start with a blank line; <pre> also enters preformatted mode
# * <br> emits a single line feed
# * <b>, <u>, <i> open the emphasis with "*", "_" and "/" respectively
# * <a> remembers its href (if any) and records it in the link list
#
# Any other tag is ignored.
#
# @param tag [String] name of the tag being opened
# @param attrs [Array<Array>] attribute pairs, name at index 0 and value at index 1
def unknown_starttag(tag, attrs)
  case tag
  when 'p' then @savedata << "\n\n"
  when 'br' then @savedata << "\n"
  when 'b' then @savedata << '*'
  when 'u' then @savedata << '_'
  when 'i' then @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = true
  when 'a'
    # When href is given more than once the last occurrence wins.
    href_attr = attrs.reverse_each.find { |attr| attr[0] == 'href' }
    @href = href_attr && href_attr[1]
    # The link is stored without its surrounding quotes, when it has them.
    @links << @href.gsub(/^("|'|)(.*)("|')$/,'\2') if @href
  end
end