Class: MList::Util::HtmlTextExtraction

Inherits:
Object
  • Object
show all
Defined in:
lib/mlist/util/email_helpers.rb

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ HtmlTextExtraction

Returns a new instance of HtmlTextExtraction.



5
6
7
# File 'lib/mlist/util/email_helpers.rb', line 5

# Builds an extractor for the given HTML.
#
# The input is handed to Hpricot() and the resulting document is kept in
# @doc for the other methods to read.
#
# @param html the HTML to extract text from (presumably a String of markup;
#   whatever Hpricot() accepts -- confirm against callers)
def initialize(html)
  @doc = Hpricot(html)
end

Instance Method Details

#execute ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/mlist/util/email_helpers.rb', line 9

# Renders the parsed document as plain text.
#
# Resets @text and @anchors, passes every top-level element node of @doc to
# extract_text_from_node (non-element children are skipped), then strips
# surrounding whitespace. When any link targets were collected in @anchors,
# a "--" separated footer of numbered references ("[1] href") is appended.
#
# @return [String] the accumulated text (the same object as @text)
def execute
  @text = ''
  @anchors = []

  @doc.each_child do |child|
    next unless Hpricot::Elem::Trav === child
    extract_text_from_node(child)
  end

  @text.strip!
  return @text if @anchors.empty?

  # Reference numbers are 1-based to match the "[n]" markers placed inline.
  footnotes = @anchors.each_with_index.map { |href, idx| "[#{idx + 1}] #{href}" }
  @text << "\n\n--\n" << footnotes.join("\n")
  @text
end

#extract_text_from_children(elem) ⇒ Object



64
65
66
67
68
69
70
71
72
73
# File 'lib/mlist/util/email_helpers.rb', line 64

# Walks the direct children of +elem+ and routes each one to the matching
# extractor: text nodes go to extract_text_from_text_node, element nodes go
# to extract_text_from_node. Children that are neither are ignored.
#
# @param elem a node responding to each_child
def extract_text_from_children(elem)
  elem.each_child do |child|
    # The text check comes first, so a node matching both is treated as text.
    if Hpricot::Text::Trav === child
      extract_text_from_text_node(child)
    elsif Hpricot::Elem::Trav === child
      extract_text_from_node(child)
    end
  end
end

#extract_text_from_node(node) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/mlist/util/email_helpers.rb', line 25

# Appends a plain-text rendering of the element +node+ to @text, choosing the
# format from the element's tag name:
#
# * head           -- emits nothing
# * h1..h6         -- inner text followed by a blank line
# * br             -- a single newline
# * ol / ul        -- one " N. item" / " * item" entry per direct li child,
#                     each followed by a blank line; item text is stripped
# * strong / em    -- inner text wrapped in *...* / _..._
# * dl             -- every dt and dd found by traverse_element is rendered
#                     through this method
# * a              -- link text followed by a "[n]" marker; the href is
#                     recorded in @anchors for the reference footer
# * p, dt, dd      -- children rendered, trailing whitespace trimmed, then a
#                     blank line
# * anything else  -- children rendered with no extra decoration
#
# @param node an element node (responds to name, inner_text, [] and the
#   traversal helpers used below)
def extract_text_from_node(node)
  case node.name
  when 'head'
    # Deliberately empty: nothing under <head> is rendered.
  when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
    @text << node.inner_text
    @text << "\n\n"
  when 'br'
    @text << "\n"
  when 'ol'
    node.children_of_type('li').each_with_index do |li, i|
      # Strip the item text, as the 'ul' branch does, so source indentation
      # and newlines inside <li> do not leak into the output.
      @text << " #{i + 1}. #{li.inner_text.strip}"
      @text << "\n\n"
    end
  when 'ul'
    node.children_of_type('li').each do |li|
      @text << " * #{li.inner_text.strip}"
      @text << "\n\n"
    end
  when 'strong'
    @text << "*#{node.inner_text}*"
  when 'em'
    @text << "_#{node.inner_text}_"
  when 'dl'
    node.traverse_element('dt', 'dd') do |dt_dd|
      extract_text_from_node(dt_dd)
    end
  when 'a'
    href = node['href']
    # An anchor without a usable href (e.g. <a name="top">) has no target to
    # reference, so it contributes only its text: no "[n]" marker and no
    # empty entry in the reference footer.
    linked = !href.to_s.strip.empty?
    @anchors << href if linked
    extract_text_from_text_node(node)
    @text << "[#{@anchors.size}]" if linked
  when 'p', 'dt', 'dd'
    extract_text_from_children(node)
    @text.rstrip!
    @text << "\n\n"
  else
    extract_text_from_children(node)
  end
end

#extract_text_from_text_node(node) ⇒ Object



75
76
77
78
# File 'lib/mlist/util/email_helpers.rb', line 75

# Appends the inner text of +node+ to @text with whitespace normalised.
#
# When @text currently ends with a newline the new text starts a fresh line,
# so its leading whitespace is dropped. Every run of whitespace (spaces,
# tabs, newlines) is then collapsed to a single space, so line breaks in the
# HTML source separate words instead of gluing them together.
#
# @param node a node responding to inner_text
# @return [String] @text, after appending
def extract_text_from_text_node(node)
  text = @text.end_with?("\n") ? node.inner_text.lstrip : node.inner_text
  @text << text.gsub(/\s+/, ' ')
end