Class: Boilerpipe::SAX::HTMLContentHandler
- Inherits:
-
Nokogiri::XML::SAX::Document
- Object
- Nokogiri::XML::SAX::Document
- Boilerpipe::SAX::HTMLContentHandler
- Defined in:
- lib/boilerpipe/sax/html_content_handler.rb
Constant Summary collapse
- ANCHOR_TEXT_START = "$\ue00a<"
- ANCHOR_TEXT_END = ">\ue00a$"
- VALID_WORD_CHARACTER = /[\p{L}\p{Nd}\p{Nl}\p{No}]/
Instance Attribute Summary collapse
-
#font_size_stack ⇒ Object
Returns the value of attribute font_size_stack.
-
#in_anchor_tag ⇒ Object
Returns the value of attribute in_anchor_tag.
-
#in_ignorable_element ⇒ Object
readonly
Returns the value of attribute in_ignorable_element.
-
#label_stacks ⇒ Object
readonly
Returns the value of attribute label_stacks.
-
#last_start_tag ⇒ Object
readonly
Returns the value of attribute last_start_tag.
-
#token_buffer ⇒ Object
Returns the value of attribute token_buffer.
Instance Method Summary collapse
- #add_label_action(label_action) ⇒ Object
- #add_text_block(text_block) ⇒ Object
-
#append_space ⇒ Object
Appends a space unless the last appended character was already one.
- #append_text(text) ⇒ Object
- #append_token(token) ⇒ Object
- #characters(text) ⇒ Object
-
#decrease_in_ignorable_element! ⇒ Object
Open question: should the counter be prevented from dropping below zero here?
- #end_element(name) ⇒ Object
- #enter_body_tag! ⇒ Object
- #exit_body_tag! ⇒ Object
- #flush_block ⇒ Object
- #in_anchor_tag? ⇒ Boolean
- #in_ignorable_element? ⇒ Boolean
-
#increase_in_ignorable_element! ⇒ Object
public void flushBlock() { int numWords = 0; int numLinkedWords = 0; int numWrappedLines = 0; int currentLineLength = -1; // don’t count the first space final int maxLineLength = 80; int numTokens = 0; int numWordsCurrentLine = 0; }.
-
#initialize ⇒ HTMLContentHandler
constructor
A new instance of HTMLContentHandler.
-
#is_word?(word) ⇒ Boolean
Unicode regex categories: \p{L} – a letter; \p{Nd} – a decimal digit; \p{Nl} – a letter-like numeric character; \p{No} – a numeric character of other type.
- #not_in_body_tag? ⇒ Boolean
- #start_element(name, attrs = []) ⇒ Object
- #text_document ⇒ Object
- #token_buffer_size ⇒ Object
Constructor Details
#initialize ⇒ HTMLContentHandler
Returns a new instance of HTMLContentHandler.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 9
#
# Sets up a handler with empty buffers, zeroed nesting counters and no
# collected text blocks.
#
# @return [HTMLContentHandler] a new instance of HTMLContentHandler
def initialize
  @label_stacks = []
  @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
  @tag_level = 0
  @sb_last_was_whitespace = false
  # Unary plus yields unfrozen strings, so the `<<` appends done by the
  # append_* methods keep working under `# frozen_string_literal: true`.
  @text_buffer = +''
  @token_buffer = +''
  @offset_blocks = 0
  @flush = false
  # -1 marks "no block level recorded yet"; #characters fills it in and
  # #flush_block resets it.
  @block_tag_level = -1
  @in_body_tag = 0
  @in_anchor_tag = 0
  @in_ignorable_element = 0
  @in_anchor_text = false
  @font_size_stack = []
  @last_start_tag = ''
  # A bare `@title` is a no-op expression; assign explicitly so the
  # instance variable is actually defined.
  @title = nil
  # Written later by the SAX callbacks; defined up front so every piece of
  # state is visible in one place.
  @last_event = nil
  @last_end_tag = nil
  @text_blocks = []
end
Instance Attribute Details
#font_size_stack ⇒ Object
Returns the value of attribute font_size_stack.
5 6 7 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 5
#
# Reader for the font-size stack, which the constructor initializes to an
# empty array.
#
# @return [Object] the current value of @font_size_stack
def font_size_stack
  @font_size_stack
end
#in_anchor_tag ⇒ Object
Returns the value of attribute in_anchor_tag.
5 6 7 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 5
#
# Reader for the anchor-tag counter that #in_anchor_tag? compares against
# zero; the constructor initializes it to 0.
#
# @return [Object] the current value of @in_anchor_tag
def in_anchor_tag
  @in_anchor_tag
end
#in_ignorable_element ⇒ Object (readonly)
Returns the value of attribute in_ignorable_element.
3 4 5 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 3
#
# Read-only view of the ignorable-element counter that is adjusted by
# #increase_in_ignorable_element! and #decrease_in_ignorable_element!.
#
# @return [Object] the current value of @in_ignorable_element
def in_ignorable_element
  @in_ignorable_element
end
#label_stacks ⇒ Object (readonly)
Returns the value of attribute label_stacks.
3 4 5 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 3
#
# Read-only view of the label stacks: #start_element pushes a nil slot and
# #end_element pops one.
#
# @return [Object] the current value of @label_stacks
def label_stacks
  @label_stacks
end
#last_start_tag ⇒ Object (readonly)
Returns the value of attribute last_start_tag.
3 4 5 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 3
#
# Read-only view of the most recent start tag. It is the empty string until
# #start_element stores an upcased Symbol such as :TITLE.
#
# @return [Object] the current value of @last_start_tag
def last_start_tag
  @last_start_tag
end
#token_buffer ⇒ Object
Returns the value of attribute token_buffer.
5 6 7 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 5
#
# Reader for the token buffer, the string that #append_space, #append_text
# and #append_token append to.
#
# @return [Object] the current value of @token_buffer
def token_buffer
  @token_buffer
end
Instance Method Details
#add_label_action(label_action) ⇒ Object
261 262 263 264 265 266 267 268 269 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 261
#
# Records a label action on the innermost label stack. When the top slot is
# still nil it is replaced by a fresh array first.
#
# @param label_action [Object] the action to record
# @return [Array] the stack the action was appended to
def add_label_action(label_action)
  current = @label_stacks.last
  if current.nil?
    # Swap the nil placeholder on top for a real stack.
    current = []
    @label_stacks.pop
    @label_stacks.push(current)
  end
  current.push(label_action)
end
#add_text_block(text_block) ⇒ Object
230 231 232 233 234 235 236 237 238 239 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 230
#
# Applies the labels of every recorded label action to the given block and
# then stores the block. Empty slots in the stacks are skipped.
#
# @param text_block [Object] block that responds to #add_label
# @return [Array] the list of collected text blocks
def add_text_block(text_block)
  @label_stacks.each do |actions|
    next if !actions

    actions.each do |action|
      next if !action

      text_block.add_label(action.labels)
    end
  end
  @text_blocks.push(text_block)
end
#append_space ⇒ Object
Appends a space unless the last appended character was already one.
242 243 244 245 246 247 248 249 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 242
#
# Appends one space to both buffers unless the previous append was already
# whitespace.
#
# @return [String, nil] the token buffer, or nil when nothing was appended
def append_space
  unless @sb_last_was_whitespace
    @sb_last_was_whitespace = true
    @text_buffer.concat(' ')
    @token_buffer.concat(' ')
  end
end
#append_text(text) ⇒ Object
251 252 253 254 255 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 251
#
# Appends text to both the text buffer and the token buffer, and records
# that the last append was not whitespace.
#
# @param text [String] text to append
# @return [String] the token buffer
def append_text(text)
  @sb_last_was_whitespace = false
  @text_buffer.concat(text)
  @token_buffer.concat(text)
end
#append_token(token) ⇒ Object
257 258 259 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 257
#
# Appends a token to the token buffer only; the text buffer is left
# untouched.
#
# @param token [String] token to append
# @return [String] the token buffer
def append_token(token)
  @token_buffer.concat(token)
end
#characters(text) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 47
#
# SAX callback for character data. Flushes a pending block first, ignores
# text inside ignorable elements, collapses whitespace runs to single
# spaces and appends the result to the buffers.
#
# @param text [String] raw character data; it is never modified
# @return [Symbol, nil] :CHARACTERS once text was appended, nil otherwise
def characters(text)
  flush_block if @flush

  return if in_ignorable_element?
  return if text.empty?

  # Work on copies: the in-place gsub!/strip! used previously altered the
  # caller's string and raised FrozenError when it was frozen.
  collapsed = text.gsub(/\s+/, ' ')

  # After collapsing, any leading or trailing whitespace is exactly one space.
  started_with_whitespace = collapsed.start_with?(' ')
  ended_with_whitespace = collapsed.end_with?(' ')
  stripped = collapsed.strip

  # Whitespace-only input contributes at most a single space.
  if stripped.empty?
    append_space
    @last_event = :WHITESPACE
    return
  end

  # Remember the tag level at which this block's text started.
  @block_tag_level = @tag_level if @block_tag_level == -1

  append_space if started_with_whitespace
  append_text(stripped)
  append_space if ended_with_whitespace

  @last_event = :CHARACTERS
end
#decrease_in_ignorable_element! ⇒ Object
Open question: should the counter be prevented from dropping below zero here?
210 211 212 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 210
#
# Lowers the ignorable-element counter by one. No lower bound is enforced,
# so the counter can become negative.
#
# @return [Integer] the new counter value
def decrease_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element - 1
end
#end_element(name) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 78
#
# SAX callback for a closing tag. Delegates to the tag action registered
# for the tag when there is one; tags without an action always request a
# flush. Afterwards it updates the tag level, flushes if requested and
# pops the label stack of the element.
#
# @param name [String] tag name as reported by the parser
# @return [Object, nil] the label stack popped for this element
def end_element(name)
  tag = name.upcase.to_sym
  action = @tag_actions[tag]

  if action
    # Non-short-circuit OR: keep an already pending flush request.
    @flush = action.end_tag(self, name) | @flush
  else
    @flush = true
  end

  leaves_level = action.nil? || action.changes_tag_level?
  @tag_level -= 1 if leaves_level

  flush_block if @flush

  @last_event = :END_TAG
  @last_end_tag = tag
  @label_stacks.pop
end
#enter_body_tag! ⇒ Object
214 215 216 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 214
#
# Raises the body-tag counter by one.
#
# @return [Integer] the new counter value
def enter_body_tag!
  @in_body_tag = @in_body_tag + 1
end
#exit_body_tag! ⇒ Object
218 219 220 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 218
#
# Lowers the body-tag counter by one.
#
# @return [Integer] the new counter value
def exit_body_tag!
  @in_body_tag = @in_body_tag - 1
end
#flush_block ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 99
#
# Turns the buffered text into a TextBlock and stores it via
# #add_text_block. While #not_in_body_tag? holds, the buffer is only used
# to capture the title (when the last start tag was :TITLE) and is then
# discarded.
#
# Word and line statistics are computed by simulating wrapping at 80
# characters per line.
#
# @return [void]
def flush_block
  @flush = false

  # Not inside the body: keep the title, drop everything else.
  if not_in_body_tag?
    @title = @token_buffer.strip if :TITLE == @last_start_tag
    clear_buffers
    return
  end

  # Nothing buffered, or just one character: no block to emit. A lone
  # whitespace character is discarded.
  case @token_buffer.size
  when 0
    return
  when 1
    clear_buffers if @sb_last_was_whitespace
    return
  end

  num_tokens = 0
  num_words = 0
  num_words_current_line = 0
  num_wrapped_lines = 0
  num_linked_words = 0
  # Start at -1 so the separator counted with the first word of the first
  # line cancels out. This matches the reset after a wrap, which also
  # excludes the separator, and the Java original. Starting at 0 made a
  # first line of exactly 80 characters count as wrapped.
  current_line_length = -1
  max_line_length = 80

  tokens = ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer)
  tokens.each do |token|
    if ANCHOR_TEXT_START == token
      @in_anchor_text = true
    elsif ANCHOR_TEXT_END == token
      @in_anchor_text = false
    elsif is_word?(token)
      num_tokens += 1
      num_words += 1
      num_words_current_line += 1
      num_linked_words += 1 if @in_anchor_text

      token_length = token.size
      current_line_length += token_length + 1
      if current_line_length > max_line_length
        # The word does not fit: it becomes the first word of a new line.
        num_wrapped_lines += 1
        current_line_length = token_length
        num_words_current_line = 1
      end
    else
      num_tokens += 1
    end
  end

  return if num_tokens == 0

  if num_wrapped_lines == 0
    # Everything fitted on one line; treat it as a single wrapped line.
    num_words_in_wrapped_lines = num_words
    num_wrapped_lines = 1
  else
    # Words on the last, unfinished line do not count as wrapped.
    num_words_in_wrapped_lines = num_words - num_words_current_line
  end

  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
                                                     num_words,
                                                     num_linked_words,
                                                     num_words_in_wrapped_lines,
                                                     num_wrapped_lines,
                                                     @offset_blocks)
  @offset_blocks += 1
  clear_buffers
  text_block.set_tag_level(@block_tag_level)
  add_text_block(text_block)
  @block_tag_level = -1
end
#in_anchor_tag? ⇒ Boolean
226 227 228 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 226
#
# Tells whether the anchor-tag counter is above zero.
#
# @return [Boolean] true while @in_anchor_tag is greater than zero
def in_anchor_tag?
  @in_anchor_tag.positive?
end
#in_ignorable_element? ⇒ Boolean
222 223 224 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 222
#
# Tells whether the ignorable-element counter is above zero; #characters
# drops its input while this holds.
#
# @return [Boolean] true while @in_ignorable_element is greater than zero
def in_ignorable_element?
  @in_ignorable_element.positive?
end
#increase_in_ignorable_element! ⇒ Object
public void flushBlock()
int numWords = 0;
int numLinkedWords = 0;
int numWrappedLines = 0;
int currentLineLength = -1; // don't count the first space
final int maxLineLength = 80;
int numTokens = 0;
int numWordsCurrentLine = 0;
205 206 207 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 205
#
# Raises the ignorable-element counter by one.
#
# @return [Integer] the new counter value
def increase_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element + 1
end
#is_word?(word) ⇒ Boolean
Unicode regex categories: \p{L} – a letter; \p{Nd} – a decimal digit; \p{Nl} – a letter-like numeric character; \p{No} – a numeric character of other type.
191 192 193 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 191
#
# Tells whether the token contains at least one character matched by
# VALID_WORD_CHARACTER (Unicode categories L, Nd, Nl and No).
#
# Uses Regexp#match? so the predicate returns a real boolean, as
# documented, rather than a match index or nil, and so it does not touch
# the global match variables.
#
# @param word [String, nil] token to test
# @return [Boolean] true when the token counts as a word
def is_word?(word)
  VALID_WORD_CHARACTER.match?(word)
end
#not_in_body_tag? ⇒ Boolean
95 96 97 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 95
#
# Tells whether the body-tag counter is zero, i.e. #enter_body_tag! and
# #exit_body_tag! calls are balanced or none has happened yet.
#
# @return [Boolean] true when @in_body_tag equals zero
def not_in_body_tag?
  @in_body_tag.zero?
end
#start_element(name, attrs = []) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 30
#
# SAX callback for an opening tag. Opens a new (empty) label-stack slot,
# then delegates to the tag action registered for the tag when there is
# one; tags without an action raise the tag level and request a flush.
#
# @param name [String] tag name as reported by the parser
# @param attrs [Array] attributes, passed through to the tag action
# @return [Symbol] the upcased tag name stored as last start tag
def start_element(name, attrs = [])
  @label_stacks.push(nil)
  tag = name.upcase.to_sym
  action = @tag_actions[tag]

  if action
    # The level is adjusted before the action runs.
    @tag_level += 1 if action.changes_tag_level?
    # Non-short-circuit OR: keep an already pending flush request.
    @flush = action.start(self, name, attrs) | @flush
  else
    @tag_level += 1
    @flush = true
  end

  @last_event = :START_TAG
  @last_start_tag = tag
end
#text_document ⇒ Object
175 176 177 178 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 175
#
# Flushes whatever is still buffered and wraps the title and the collected
# text blocks in a TextDocument.
#
# @return [Boilerpipe::Document::TextDocument] the resulting document
def text_document
  flush_block
  document_class = ::Boilerpipe::Document::TextDocument
  document_class.new(@title, @text_blocks)
end
#token_buffer_size ⇒ Object
180 181 182 |
# File 'lib/boilerpipe/sax/html_content_handler.rb', line 180
#
# Reports how many characters the token buffer currently holds.
#
# @return [Integer] size of @token_buffer
def token_buffer_size
  @token_buffer.length
end