Class: Boilerpipe::Document::TextBlock
- Inherits:
-
Object
- Object
- Boilerpipe::Document::TextBlock
- Defined in:
- lib/boilerpipe/document/text_block.rb
Instance Attribute Summary collapse
-
#content ⇒ Object
Returns the value of attribute content.
-
#labels ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#link_density ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#num_full_text_words ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#num_words ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#num_words_in_anchor_text ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#num_words_in_wrapped_lines ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#num_wrapped_lines ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#offset_blocks_end ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#offset_blocks_start ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#tag_level ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#text ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
-
#text_density ⇒ Object
readonly
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999).
Class Method Summary collapse
Instance Method Summary collapse
- #add_label(label) ⇒ Object
- #add_labels(labels) ⇒ Object
- #clone ⇒ Object
- #has_label?(label) ⇒ Boolean
-
#initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0) ⇒ TextBlock
constructor
A new instance of TextBlock.
- #is_content? ⇒ Boolean
- #is_not_content? ⇒ Boolean
- #merge_next(other) ⇒ Object
- #remove_label(label) ⇒ Object
- #set_tag_level(level) ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0) ⇒ TextBlock
Returns a new instance of TextBlock.
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/boilerpipe/document/text_block.rb', line 12
#
# Creates a text block from its text and the word/line statistics gathered
# for it. The block starts with no labels, is not marked as content, and has
# tag level 0.
#
# @param text the block's text, stored as given
# @param num_words number of words in the block (default 0)
# @param num_words_in_anchor_text number of words inside anchor text (default 0)
# @param num_words_in_wrapped_lines number of words in wrapped lines (default 0)
# @param num_wrapped_lines number of wrapped lines (default 1)
# @param offset_blocks block offset, used for both the start and end offsets (default 0)
def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
  # Fixed initial state.
  @labels = Set.new
  @content = false
  @tag_level = 0
  @num_full_text_words = 0

  # State taken from the arguments.
  @text = text
  @num_words = num_words
  @num_words_in_anchor_text = num_words_in_anchor_text
  @num_words_in_wrapped_lines = num_words_in_wrapped_lines
  @num_wrapped_lines = num_wrapped_lines

  # A fresh block covers a single offset, so start and end coincide.
  @offset_blocks_start = @offset_blocks_end = offset_blocks

  # NOTE(review): init_densities is defined outside this listing; presumably it
  # computes @text_density and @link_density from the counts above - confirm.
  init_densities
end
Instance Attribute Details
#content ⇒ Object
Returns the value of attribute content.
10 11 12 |
# File 'lib/boilerpipe/document/text_block.rb', line 10
#
# Reader for the content flag. The constructor sets it to false and
# merge_next ORs in the flag of the merged block.
def content
  @content
end
#labels ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the block's labels (a Set created by the constructor).
def labels
  @labels
end
#link_density ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for @link_density.
# NOTE(review): the assignment is not in this listing; presumably
# init_densities sets it - confirm.
def link_density
  @link_density
end
#num_full_text_words ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the full-text word count. It starts at 0 and merge_next adds
# the merged block's count to it.
def num_full_text_words
  @num_full_text_words
end
#num_words ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the word count given to the constructor; merge_next adds the
# merged block's count to it.
def num_words
  @num_words
end
#num_words_in_anchor_text ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the anchor-text word count given to the constructor; merge_next
# adds the merged block's count to it.
def num_words_in_anchor_text
  @num_words_in_anchor_text
end
#num_words_in_wrapped_lines ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the wrapped-line word count given to the constructor;
# merge_next adds the merged block's count to it.
def num_words_in_wrapped_lines
  @num_words_in_wrapped_lines
end
#num_wrapped_lines ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the wrapped-line count given to the constructor (default 1);
# merge_next adds the merged block's count to it.
def num_wrapped_lines
  @num_wrapped_lines
end
#offset_blocks_end ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the end block offset. It starts as the constructor's
# offset_blocks argument; merge_next keeps the larger of the two blocks' values.
def offset_blocks_end
  @offset_blocks_end
end
#offset_blocks_start ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the start block offset. It starts as the constructor's
# offset_blocks argument; merge_next keeps the smaller of the two blocks' values.
def offset_blocks_start
  @offset_blocks_start
end
#tag_level ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the tag level. It starts at 0, is overwritten by set_tag_level,
# and merge_next keeps the smaller of the two blocks' levels.
def tag_level
  @tag_level
end
#text ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for the block's text. merge_next appends the merged block's text
# after a newline.
def text
  @text
end
#text_density ⇒ Object (readonly)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
6 7 8 |
# File 'lib/boilerpipe/document/text_block.rb', line 6
#
# Reader for @text_density.
# NOTE(review): the assignment is not in this listing; presumably
# init_densities sets it - confirm.
def text_density
  @text_density
end
Class Method Details
.empty_start ⇒ Object
28 29 30 |
# File 'lib/boilerpipe/document/text_block.rb', line 28
#
# Builds a block with empty text, all word/line counts set to zero, and a
# block offset of -1.
#
# @return [TextBlock] a new empty block
def self.empty_start
  new('', 0, 0, 0, 0, -1)
end
Instance Method Details
#add_label(label) ⇒ Object
44 45 46 |
# File 'lib/boilerpipe/document/text_block.rb', line 44
#
# Attaches a single label to this block.
#
# @param label the label to add
# @return the label collection, as returned by its add operation
def add_label(label)
  @labels.add(label)
end
#add_labels(labels) ⇒ Object
48 49 50 51 52 |
# File 'lib/boilerpipe/document/text_block.rb', line 48
#
# Attaches every label in the given collection by delegating to add_label.
#
# @param labels an enumerable of labels (must respond to #each)
# @return whatever labels.each returns (the collection itself for core enumerables)
def add_labels(labels)
  labels.each { |label| add_label(label) }
end
#clone ⇒ Object
95 96 97 |
# File 'lib/boilerpipe/document/text_block.rb', line 95
#
# Cloning a text block is not supported.
#
# The previous body used `throw NotImplementedError`, which is the
# catch/throw control-flow primitive: with no matching `catch` it surfaces as
# an UncaughtThrowError instead of the intended exception. `raise` signals
# the error properly.
#
# @raise [NotImplementedError] always
def clone
  raise NotImplementedError, "#{self.class}#clone is not implemented"
end
#has_label?(label) ⇒ Boolean
54 55 56 |
# File 'lib/boilerpipe/document/text_block.rb', line 54
#
# Tells whether the given label is attached to this block.
#
# @param label the label to look for
# @return [Boolean] true when the label is present
def has_label?(label)
  @labels.member?(label)
end
#is_content? ⇒ Boolean
36 37 38 |
# File 'lib/boilerpipe/document/text_block.rb', line 36
#
# Predicate form of the content flag.
#
# @return the current value of @content (false for a freshly built block)
def is_content?
  @content
end
#is_not_content? ⇒ Boolean
40 41 42 |
# File 'lib/boilerpipe/document/text_block.rb', line 40
#
# Negation of is_content?.
#
# @return [Boolean] true when the block is not flagged as content
def is_not_content?
  !is_content?
end
#merge_next(other) ⇒ Object
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/boilerpipe/document/text_block.rb', line 62
#
# Folds the block that follows this one into this block, in place.
#
# The text is joined with a newline, the word/line counts are summed, the
# offset range is widened to cover both blocks, the content flags are ORed,
# the labels are unioned, and the lower of the two tag levels is kept.
#
# @param other the block to merge into this one; it is only read
# @return the resulting tag level (value of the final assignment)
def merge_next(other)
  @text = "#{@text}\n#{other.text}"

  # Accumulate the counters of the merged block.
  @num_words += other.num_words
  @num_words_in_anchor_text += other.num_words_in_anchor_text
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
  @num_wrapped_lines += other.num_wrapped_lines

  # Widen the covered offset range.
  @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
  @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max

  # Densities are refreshed once the counters above are updated and, as in
  # the original ordering, before the full-text word count changes.
  init_densities

  @content |= other.is_content?
  @num_full_text_words += other.num_full_text_words

  incoming = other.labels
  if incoming
    # Work on a copy so the other block's label collection is never shared.
    copied = incoming.clone
    if @labels.nil?
      @labels = copied
    else
      @labels.merge(copied)
    end
  end

  @tag_level = [@tag_level, other.tag_level].min
end
#remove_label(label) ⇒ Object
58 59 60 |
# File 'lib/boilerpipe/document/text_block.rb', line 58
#
# Detaches a label from this block.
#
# @param label the label to remove
# @return the result of the label collection's delete operation
def remove_label(label)
  @labels.delete(label)
end
#set_tag_level(level) ⇒ Object
32 33 34 |
# File 'lib/boilerpipe/document/text_block.rb', line 32
#
# Overwrites the block's tag level.
#
# @param level the new tag level
# @return the level that was assigned
def set_tag_level(level)
  @tag_level = level
end
#to_s ⇒ Object
86 87 88 89 90 91 92 93 |
# File 'lib/boilerpipe/document/text_block.rb', line 86
#
# Renders a debug representation of the block: a bracketed header with the
# offset range, tag level, word count, wrapped-line count and link density,
# then a tab, the CONTENT/BOILERPLATE marker, the labels, and finally the
# text on its own line.
#
# Java original, kept for reference:
# "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
#
# @return [String] the formatted description
def to_s
  # An empty label set is printed as the literal 'null'.
  labels = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"

  header = "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]"
  marker = is_content? ? 'CONTENT' : 'BOILERPLATE'

  "#{header}\t#{marker},#{labels}\n#{text}"
end