Class: Mullet::HTML::PageBuilder

Inherits:
Mullet::HTML::Parser::DefaultHandler show all
Defined in:
lib/mullet/html/page_builder.rb

Overview

Handles SAX events to extract content from an HTML page.

Constant Summary collapse

# Element name compared against in start_element/end_element to keep count
# of the head elements that are currently open.
HEAD =
'head'
# Element name whose inner HTML is stored in the page under :title when it
# closes while a head element is open.
TITLE =
'title'
# Element name whose inner HTML is stored in the page under :body.
BODY =
'body'
# Opening delimiter of a CDATA section. Not referenced by the methods shown
# on this page.
START_CDATA =
'<![CDATA['
# Closing delimiter of a CDATA section. Not referenced by the methods shown
# on this page.
END_CDATA =
']]>'
VOID_ELEMENTS =

In HTML5, void elements do not have content and do not have end tags. TODO: Write an alternative implementation where we don’t have to know the void elements when deciding if we should render an end tag.

Set[
'area',
'base',
'br',
'col',
'command',
'embed',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr']

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Mullet::HTML::Parser::DefaultHandler

#doctype

Instance Attribute Details

#page ⇒ Object (readonly)

Returns the value of attribute page.



48
49
50
# File 'lib/mullet/html/page_builder.rb', line 48

# Reader for the data this handler has collected.
#
# start_document assigns a new empty Hash to @page, and end_element stores
# the extracted inner HTML in it under the :title and :body keys.
#
# @return [Hash, nil] the current value of @page (nil until it is assigned)
def page
  @page
end

Instance Method Details

#cdata_block(data) ⇒ Object



111
112
113
114
115
# File 'lib/mullet/html/page_builder.rb', line 111

# Handles a CDATA block event.
#
# While inner HTML is being extracted, the block's data is handed to append
# exactly as received; this method adds no CDATA delimiters around it.
# Outside of extraction the event is ignored.
#
# @param data the CDATA content reported by the parser
# @return [Object, nil] whatever append returns, or nil when not extracting
def cdata_block(data)
  append(data) if extracting_inner_html?
end

#characters(data) ⇒ Object



105
106
107
108
109
# File 'lib/mullet/html/page_builder.rb', line 105

# Handles a character data event.
#
# Text is only of interest while inner HTML is being extracted; in that case
# it is passed unchanged to append. Otherwise it is dropped.
#
# @param data the text reported by the parser
# @return [Object, nil] whatever append returns, or nil when not extracting
def characters(data)
  return unless extracting_inner_html?

  append(data)
end

#comment(data) ⇒ Object



117
118
119
120
121
# File 'lib/mullet/html/page_builder.rb', line 117

# Handles a comment event.
#
# While inner HTML is being extracted, the comment is re-serialized by
# wrapping its text in "<!--" and "-->" and passed to append. Comments seen
# outside of extraction are ignored.
#
# @param data the comment text reported by the parser
# @return [Object, nil] whatever append returns, or nil when not extracting
def comment(data)
  return unless extracting_inner_html?

  markup = "<!--#{data}-->"
  append(markup)
end

#end_element(name) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/mullet/html/page_builder.rb', line 82

# Handles an end-of-element event.
#
# Steps, in order:
# 1. A closing head element decrements the count of open head elements.
# 2. While inner HTML is being extracted, the nesting depth is decremented.
#    An end tag is rendered only if extraction is still active after the
#    decrement and the element is not listed in VOID_ELEMENTS.
# 3. A closing title element, seen while a head element is open, stores
#    @inner_html in the page under :title. Otherwise a closing body element
#    stores @inner_html under :body.
#
# @param name [String] name of the element being closed
# @return [Object, nil] the stored inner HTML when a body element closes,
#   otherwise nil
def end_element(name)
  @head_count -= 1 if name == HEAD

  if extracting_inner_html?
    @inner_depth -= 1
    # Re-check after the decrement: when the depth has just reached the point
    # where extraction stops, no end tag is rendered for this element.
    still_extracting = extracting_inner_html?
    render_end_tag(name) if still_extracting && !VOID_ELEMENTS.include?(name)
  end

  if @head_count > 0 && name == TITLE
    @page[:title] = @inner_html
    nil
  elsif name == BODY
    @page[:body] = @inner_html
  end
end

#processing_instruction(data) ⇒ Object



123
124
125
126
127
# File 'lib/mullet/html/page_builder.rb', line 123

# Handles a processing instruction event.
#
# While inner HTML is being extracted, the instruction is re-serialized by
# wrapping its data in "<?" and "?>" and passed to append. Otherwise the
# event is ignored.
#
# @param data the processing instruction content reported by the parser
# @return [Object, nil] whatever append returns, or nil when not extracting
def processing_instruction(data)
  append("<?#{data}?>") if extracting_inner_html?
end

#start_document ⇒ Object



50
51
52
53
54
55
56
57
58
59
# File 'lib/mullet/html/page_builder.rb', line 50

# Handles the start-of-document event by resetting this handler's state.
#
# @return [Hash] the new, empty page hash assigned to @page
def start_document
  # How many head elements are currently open.
  @head_count = 0

  # Nesting depth of the open elements whose inner HTML this handler is
  # extracting.
  @inner_depth = 0

  @page = {}
end

#start_element(name, attributes) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/mullet/html/page_builder.rb', line 61

# Handles a start-of-element event.
#
# An opening head element increments the count of open head elements. Then
# exactly one of the following applies:
# - Extraction is already active: the nesting depth is incremented and the
#   start tag is rendered.
# - A title element opens while a head element is open: extraction starts.
# - A body element opens: extraction starts.
# - Anything else is ignored.
#
# @param name [String] name of the element being opened
# @param attributes the element's attributes, passed through to
#   render_start_tag
# @return [Object, nil] whatever start_extracting_inner_html returns when a
#   body element starts extraction, otherwise nil
def start_element(name, attributes)
  @head_count += 1 if name == HEAD

  if extracting_inner_html?
    @inner_depth += 1
    render_start_tag(name, attributes)
    nil
  elsif @head_count > 0 && name == TITLE
    start_extracting_inner_html
    nil
  elsif name == BODY
    start_extracting_inner_html
  end
end