Class: Boilerpipe::SAX::HTMLContentHandler
- Inherits:
-
Nokogiri::XML::SAX::Document
- Object
- Nokogiri::XML::SAX::Document
- Boilerpipe::SAX::HTMLContentHandler
- Defined in:
- lib/boilerpipe/sax/html_content_handler.rb
Constant Summary collapse
- ANCHOR_TEXT_START =
"$\ue00a<"
- ANCHOR_TEXT_END =
">\ue00a$"
- VALID_WORD_CHARACTER =
/[\p{L}\p{Nd}\p{Nl}\p{No}]/
Instance Attribute Summary collapse
-
#font_size_stack ⇒ Object
Returns the value of attribute font_size_stack.
-
#in_anchor_tag ⇒ Object
Returns the value of attribute in_anchor_tag.
-
#in_ignorable_element ⇒ Object
readonly
Returns the value of attribute in_ignorable_element.
-
#label_stacks ⇒ Object
readonly
Returns the value of attribute label_stacks.
-
#last_start_tag ⇒ Object
readonly
Returns the value of attribute last_start_tag.
-
#token_buffer ⇒ Object
Returns the value of attribute token_buffer.
Instance Method Summary collapse
- #add_label_action(label_action) ⇒ Object
- #add_text_block(text_block) ⇒ Object
-
#append_space ⇒ Object
append space if last character wasn’t already one.
- #append_text(text) ⇒ Object
- #append_token(token) ⇒ Object
- #characters(text) ⇒ Object
- #decrease_in_body! ⇒ Object
-
#decrease_in_ignorable_element! ⇒ Object
should we prevent less than zero here?.
- #end_element(name) ⇒ Object
- #flush_block ⇒ Object
- #in_anchor_tag? ⇒ Boolean
- #in_ignorable_element? ⇒ Boolean
- #increase_in_body! ⇒ Object
-
#increase_in_ignorable_element! ⇒ Object
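Note: the Java snippet that follows is the local-variable setup of flushBlock() and relates to #flush_block, not to this method: public void flushBlock() { int numWords = 0; int numLinkedWords = 0; int numWrappedLines = 0; int currentLineLength = -1; // don’t count the first space final int maxLineLength = 80; int numTokens = 0; int numWordsCurrentLine = 0; }.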
-
#initialize ⇒ HTMLContentHandler
constructor
A new instance of HTMLContentHandler.
-
#is_word?(word) ⇒ Boolean
Unicode regex categories: \p{L} – letter; \p{Nd} – a decimal digit; \p{Nl} – a letter-like numeric character; \p{No} – a numeric character of other type.
- #start_element(name, attrs = []) ⇒ Object
- #text_document ⇒ Object
- #token_buffer_size ⇒ Object
Constructor Details
#initialize ⇒ HTMLContentHandler
Returns a new instance of HTMLContentHandler.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 9
# Sets up the state that is tracked while SAX events are processed.
#
# All depth counters start at zero, both buffers start empty, and
# +@block_tag_level+ starts at -1, the value the other methods use to mean
# "no block level recorded yet".
def initialize
  @label_stacks = []
  @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
  @tag_level = 0
  @sb_last_was_whitespace = false
  # Unary plus yields unfrozen strings even when frozen string literals are
  # enabled; both buffers are appended to in place with <<.
  @text_buffer = +''
  @token_buffer = +''
  @offset_blocks = 0
  @flush = false
  @block_tag_level = -1
  @in_body = 0
  @in_anchor_tag = 0
  @in_ignorable_element = 0
  @in_anchor_text = false
  @font_size_stack = []
  @last_start_tag = ''
  # A bare `@title` expression does nothing; assign it explicitly.
  @title = nil
  # Written by the event callbacks; initialised here so they are always defined.
  @last_event = nil
  @last_end_tag = nil
  @text_blocks = []
end
Instance Attribute Details
#font_size_stack ⇒ Object
Returns the value of attribute font_size_stack.
5 6 7 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 5
# Reader for the +@font_size_stack+ instance variable.
#
# @return [Object] whatever is currently stored in +@font_size_stack+
def font_size_stack
  @font_size_stack
end
#in_anchor_tag ⇒ Object
Returns the value of attribute in_anchor_tag.
5 6 7 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 5
# Reader for the +@in_anchor_tag+ instance variable (the raw value, not the
# boolean answer given by #in_anchor_tag?).
#
# @return [Object] whatever is currently stored in +@in_anchor_tag+
def in_anchor_tag
  @in_anchor_tag
end
#in_ignorable_element ⇒ Object (readonly)
Returns the value of attribute in_ignorable_element.
3 4 5 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 3
# Read-only access to the +@in_ignorable_element+ instance variable (the raw
# value, not the boolean answer given by #in_ignorable_element?).
#
# @return [Object] whatever is currently stored in +@in_ignorable_element+
def in_ignorable_element
  @in_ignorable_element
end
#label_stacks ⇒ Object (readonly)
Returns the value of attribute label_stacks.
3 4 5 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 3
# Read-only access to the +@label_stacks+ instance variable.
#
# @return [Object] whatever is currently stored in +@label_stacks+
def label_stacks
  @label_stacks
end
#last_start_tag ⇒ Object (readonly)
Returns the value of attribute last_start_tag.
3 4 5 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 3
# Read-only access to the +@last_start_tag+ instance variable.
#
# @return [Object] whatever is currently stored in +@last_start_tag+
def last_start_tag
  @last_start_tag
end
#token_buffer ⇒ Object
Returns the value of attribute token_buffer.
5 6 7 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 5
# Reader for the +@token_buffer+ instance variable.
#
# @return [Object] whatever is currently stored in +@token_buffer+
def token_buffer
  @token_buffer
end
Instance Method Details
#add_label_action(label_action) ⇒ Object
255 256 257 258 259 260 261 262 263 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 255
# Records +label_action+ on the innermost label stack.
#
# When the top entry of +@label_stacks+ is nil (or there is no entry at
# all), a fresh array takes its place before the action is appended.
#
# @param label_action [Object] the action to remember
# @return [Array] the stack the action was appended to
def add_label_action(label_action)
  current = @label_stacks.last
  if current.nil?
    current = []
    # Swap the nil placeholder for a real stack.
    @label_stacks.pop
    @label_stacks.push(current)
  end
  current.push(label_action)
end
#add_text_block(text_block) ⇒ Object
224 225 226 227 228 229 230 231 232 233 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 224
# Applies every pending label action to +text_block+ and stores the block.
#
# Missing stacks and missing actions inside a stack are skipped.
#
# @param text_block [#add_label] the block to label and store
# @return [Array] +@text_blocks+ after the block was appended
def add_text_block(text_block)
  pending = @label_stacks.select(&:itself).flat_map do |actions|
    actions.select(&:itself)
  end
  pending.each { |action| text_block.add_label(action.labels) }
  @text_blocks.push(text_block)
end
#append_space ⇒ Object
append space if last character wasn’t already one
236 237 238 239 240 241 242 243 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 236
# Adds one space to the text and token buffers, unless the previous append
# was already whitespace.
#
# @return [String, nil] the token buffer when a space was added, otherwise nil
def append_space
  unless @sb_last_was_whitespace
    @sb_last_was_whitespace = true
    @text_buffer.concat(' ')
    @token_buffer.concat(' ')
  end
end
#append_text(text) ⇒ Object
245 246 247 248 249 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 245
# Appends +text+ to both buffers and notes that the last append was not
# whitespace.
#
# @param text [String] the text to add
# @return [String] the token buffer
def append_text(text)
  @sb_last_was_whitespace = false
  @text_buffer.concat(text)
  @token_buffer.concat(text)
end
#append_token(token) ⇒ Object
251 252 253 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 251
# Appends +token+ to the token buffer only; the text buffer and the
# whitespace flag are left alone.
#
# @param token [String] the token to add
# @return [String] the token buffer
def append_token(token)
  @token_buffer.concat(token)
end
#characters(text) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 47
# SAX callback for character data.
#
# Flushes a pending block first, then ignores the text when inside an
# ignorable element or when it is empty. Otherwise every whitespace run is
# collapsed to one space and the trimmed text is appended to the buffers,
# with a single space kept on each side that originally had whitespace.
#
# The argument is never modified, so frozen strings are accepted.
#
# @param text [String] the raw character data
# @return [Symbol, nil] +:CHARACTERS+ when text was appended, otherwise nil
def characters(text)
  flush_block if @flush

  return if in_ignorable_element?
  return if text.empty?

  # Collapse whitespace on a copy rather than mutating the caller's string.
  normalized = text.gsub(/\s+/, ' ')

  # After collapsing, any leading/trailing whitespace is exactly one space.
  started_with_whitespace = normalized.start_with?(' ')
  ended_with_whitespace = normalized.end_with?(' ')
  normalized = normalized.strip

  # The text was only whitespace: keep a single separating space.
  if normalized.empty?
    append_space
    @last_event = :WHITESPACE
    return
  end

  # -1 means no level has been recorded for the current block yet.
  @block_tag_level = @tag_level if @block_tag_level == -1

  append_space if started_with_whitespace
  append_text(normalized)
  append_space if ended_with_whitespace

  @last_event = :CHARACTERS
end
#decrease_in_body! ⇒ Object
212 213 214 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 212
# Lowers the +@in_body+ counter by one.
#
# @return [Integer] the new counter value
def decrease_in_body!
  @in_body = @in_body - 1
end
#decrease_in_ignorable_element! ⇒ Object
should we prevent less than zero here?
204 205 206 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 204
# Lowers the +@in_ignorable_element+ counter by one.
#
# The value is not clamped, so it can drop below zero.
#
# @return [Integer] the new counter value
def decrease_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element - 1
end
#end_element(name) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 78
# SAX callback for a closing tag.
#
# Looks up the action registered for the upper-cased tag name. A known
# action decides whether a flush is needed (OR-ed with any flush already
# pending) and whether the tag level changes; an unknown tag always forces
# a flush and lowers the tag level. Finally the label stack pushed for this
# element is removed.
#
# @param name [String] the tag name as reported by the parser
# @return [Object, nil] the label stack entry that was popped
def end_element(name)
  tag = name.upcase.to_sym
  action = @tag_actions[tag]

  if action
    # Non-short-circuit OR: end_tag runs even when a flush is already pending.
    wants_flush = action.end_tag(self, name)
    @flush = wants_flush | @flush
    @tag_level -= 1 if action.changes_tag_level?
  else
    @flush = true
    @tag_level -= 1
  end

  flush_block if @flush

  @last_event = :END_TAG
  @last_end_tag = tag
  @label_stacks.pop
end
#flush_block ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 95
# Turns the buffered text into a TextBlock and resets the buffers.
#
# While outside the body (+@in_body+ is zero) no block is produced: the
# buffers are cleared, and when the last start tag was TITLE the stripped
# token buffer is stored as the title first.
#
# Inside the body the token buffer is tokenized and the word, linked-word
# and wrapped-line statistics are computed using an 80 character line
# width. If at least one token was counted, a TextBlock is built from the
# stripped text buffer, given the recorded block tag level and handed to
# #add_text_block.
#
# @return [Object, nil] -1 (the reset block tag level) when a block was
#   emitted; otherwise the result of the early exit taken
def flush_block
  @flush = false
  if @in_body == 0
    @title = @token_buffer.strip if :TITLE == @last_start_tag
    clear_buffers
    return
  end

  # Nothing buffered, or nothing but the single space from append_space.
  case @token_buffer.size
  when 0
    return
  when 1
    if @sb_last_was_whitespace
      clear_buffers
      return
    end
    # A lone non-whitespace character is real content and is processed
    # below; returning here would leave it in the buffer, glued to the
    # front of the next block.
  end

  num_tokens = 0
  num_words = 0
  num_words_current_line = 0
  num_wrapped_lines = 0
  num_linked_words = 0
  # Starts at -1 so the separator counted with the first word is cancelled
  # out; after a wrap the length restarts at the bare token length, so
  # every line is measured the same way.
  current_line_length = -1
  max_line_length = 80

  ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer).each do |token|
    if ANCHOR_TEXT_START == token
      @in_anchor_text = true
    elsif ANCHOR_TEXT_END == token
      @in_anchor_text = false
    elsif is_word?(token)
      num_tokens += 1
      num_words += 1
      num_words_current_line += 1
      num_linked_words += 1 if @in_anchor_text
      token_length = token.size
      current_line_length += token_length + 1
      if current_line_length > max_line_length
        # This word does not fit: it becomes the first word of a new line.
        num_wrapped_lines += 1
        current_line_length = token_length
        num_words_current_line = 1
      end
    else
      num_tokens += 1
    end
  end

  return if num_tokens == 0

  if num_wrapped_lines == 0
    # Everything fitted on one line; count it as a single wrapped line.
    num_words_in_wrapped_lines = num_words
    num_wrapped_lines = 1
  else
    # Words on the last, unfinished line are not part of a wrapped line.
    num_words_in_wrapped_lines = num_words - num_words_current_line
  end

  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
                                                     num_words,
                                                     num_linked_words,
                                                     num_words_in_wrapped_lines,
                                                     num_wrapped_lines,
                                                     @offset_blocks)
  @offset_blocks += 1
  clear_buffers
  text_block.set_tag_level(@block_tag_level)
  add_text_block(text_block)
  @block_tag_level = -1
end
#in_anchor_tag? ⇒ Boolean
220 221 222 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 220
# Tells whether the +@in_anchor_tag+ counter is above zero.
#
# @return [Boolean]
def in_anchor_tag?
  @in_anchor_tag.positive?
end
#in_ignorable_element? ⇒ Boolean
216 217 218 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 216
# Tells whether the +@in_ignorable_element+ counter is above zero.
#
# @return [Boolean]
def in_ignorable_element?
  @in_ignorable_element.positive?
end
#increase_in_body! ⇒ Object
208 209 210 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 208
# Raises the +@in_body+ counter by one.
#
# @return [Integer] the new counter value
def increase_in_body!
  @in_body = @in_body + 1
end
#increase_in_ignorable_element! ⇒ Object
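Note: the Java snippet below is the local-variable setup of flushBlock() and relates to #flush_block, not to this method.
public void flushBlock()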
int numWords = 0;
int numLinkedWords = 0;
int numWrappedLines = 0;
int currentLineLength = -1; // don't count the first space
final int maxLineLength = 80;
int numTokens = 0;
int numWordsCurrentLine = 0;
199 200 201 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 199
# Raises the +@in_ignorable_element+ counter by one.
#
# @return [Integer] the new counter value
def increase_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element + 1
end
#is_word?(word) ⇒ Boolean
Unicode regex categories: \p{L} – letter; \p{Nd} – a decimal digit; \p{Nl} – a letter-like numeric character; \p{No} – a numeric character of other type
185 186 187 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 185
# Tells whether +word+ contains at least one character matched by
# VALID_WORD_CHARACTER.
#
# Regexp#match? returns a real boolean (rather than a match index or nil),
# does not set the match globals, and answers false for nil.
#
# @param word [String, nil] the token to test
# @return [Boolean]
def is_word?(word)
  VALID_WORD_CHARACTER.match?(word)
end
#start_element(name, attrs = []) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 30
# SAX callback for an opening tag.
#
# Pushes an empty (nil) label stack entry for the element, then looks up
# the action registered for the upper-cased tag name. A known action
# decides whether the tag level rises and whether a flush is needed (OR-ed
# with any flush already pending); an unknown tag always raises the tag
# level and forces a flush.
#
# @param name [String] the tag name as reported by the parser
# @param attrs [Array] the element attributes, passed through to the action
# @return [Symbol] the upper-cased tag, also stored as the last start tag
def start_element(name, attrs = [])
  @label_stacks.push(nil)
  tag = name.upcase.to_sym
  action = @tag_actions[tag]

  if action
    @tag_level += 1 if action.changes_tag_level?
    # Non-short-circuit OR: start runs even when a flush is already pending.
    wants_flush = action.start(self, name, attrs)
    @flush = wants_flush | @flush
  else
    @tag_level += 1
    @flush = true
  end

  @last_event = :START_TAG
  @last_start_tag = tag
end
#text_document ⇒ Object
169 170 171 172 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 169
# Flushes whatever is still buffered and wraps the title and the collected
# text blocks in a TextDocument.
#
# @return [Boilerpipe::Document::TextDocument]
def text_document
  # Flush first so the final block (and possibly the title) is included.
  flush_block
  document_class = ::Boilerpipe::Document::TextDocument
  document_class.new(@title, @text_blocks)
end
#token_buffer_size ⇒ Object
174 175 176 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 174
# Reports how many characters the token buffer currently holds.
#
# @return [Integer]
def token_buffer_size
  @token_buffer.length
end