Class: Boilerpipe::Document::TextBlock

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/document/text_block.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0) ⇒ TextBlock

Returns a new instance of TextBlock.



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/boilerpipe/document/text_block.rb', line 15

# Builds a text block from its text and its word/line statistics.
#
# @param text [String] the block's text
# @param num_words [Integer] stored as @num_words
# @param num_words_in_anchor_text [Integer] stored as @num_words_in_anchor_text
# @param num_words_in_wrapped_lines [Integer] stored as @num_words_in_wrapped_lines
# @param num_wrapped_lines [Integer] stored as @num_wrapped_lines
# @param offset_blocks [Integer] used for both the start and the end offset
def initialize(text,
               num_words = 0,
               num_words_in_anchor_text = 0,
               num_words_in_wrapped_lines = 0,
               num_wrapped_lines = 1,
               offset_blocks = 0)
  # Values supplied by the caller.
  @text = text
  @num_words = num_words
  @num_words_in_anchor_text = num_words_in_anchor_text
  @num_words_in_wrapped_lines = num_words_in_wrapped_lines
  @num_wrapped_lines = num_wrapped_lines

  # A new block starts and ends at the same offset.
  @offset_blocks_start = @offset_blocks_end = offset_blocks

  # Fixed starting state.
  @labels = Set.new
  @content = false
  @tag_level = 0
  @num_full_text_words = 0

  # Runs last, once every counter above is assigned.
  init_densities
end

Instance Attribute Details

#content ⇒ Object

Returns the value of attribute content.



13
14
15
# File 'lib/boilerpipe/document/text_block.rb', line 13

# Returns the current value of @content (the constructor sets it to false).
def content
  @content
end

#labels ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @labels, which the constructor initializes to an empty Set.
def labels
  @labels
end

#link_density ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @link_density.
# NOTE(review): @link_density is not assigned in the visible code; presumably
# init_densities sets it -- confirm.
def link_density
  @link_density
end

#num_full_text_words ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @num_full_text_words (0 after construction; merge_next adds the
# other block's value to it).
def num_full_text_words
  @num_full_text_words
end

#num_words ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @num_words, the word count passed to the constructor plus any
# counts added by merge_next.
def num_words
  @num_words
end

#num_words_in_anchor_text ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @num_words_in_anchor_text, as given to the constructor and
# accumulated by merge_next.
def num_words_in_anchor_text
  @num_words_in_anchor_text
end

#num_words_in_wrapped_lines ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @num_words_in_wrapped_lines, as given to the constructor and
# accumulated by merge_next.
def num_words_in_wrapped_lines
  @num_words_in_wrapped_lines
end

#num_wrapped_lines ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @num_wrapped_lines (constructor default is 1; merge_next adds the
# other block's value).
def num_wrapped_lines
  @num_wrapped_lines
end

#offset_blocks_end ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @offset_blocks_end: initially the constructor's offset_blocks,
# raised by merge_next to the larger of the two blocks' end offsets.
def offset_blocks_end
  @offset_blocks_end
end

#offset_blocks_start ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @offset_blocks_start: initially the constructor's offset_blocks,
# lowered by merge_next to the smaller of the two blocks' start offsets.
def offset_blocks_start
  @offset_blocks_start
end

#tag_level ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @tag_level (0 after construction; changed by set_tag_level and
# merge_next).
def tag_level
  @tag_level
end

#text ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @text, the block's text (merge_next appends the other block's text
# after a newline).
def text
  @text
end

#text_density ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



9
10
11
# File 'lib/boilerpipe/document/text_block.rb', line 9

# Returns @text_density.
# NOTE(review): @text_density is not assigned in the visible code; presumably
# init_densities sets it -- confirm.
def text_density
  @text_density
end

Class Method Details

.empty_start ⇒ Object



31
32
33
# File 'lib/boilerpipe/document/text_block.rb', line 31

# Builds a block with empty text, every count set to zero and a block
# offset of -1.
def self.empty_start
  new(
    '', # text
    0,  # num_words
    0,  # num_words_in_anchor_text
    0,  # num_words_in_wrapped_lines
    0,  # num_wrapped_lines
    -1  # offset_blocks
  )
end

Instance Method Details

#add_label(label) ⇒ Object



47
48
49
# File 'lib/boilerpipe/document/text_block.rb', line 47

# Adds +label+ to this block's label set; adding a label that is already
# present leaves the set unchanged.
#
# @param label [Object] the label to record
# @return [Set] the label set itself
def add_label(label)
  @labels.add(label)
end

#add_labels(labels) ⇒ Object



51
52
53
54
55
# File 'lib/boilerpipe/document/text_block.rb', line 51

# Adds every element of +labels+ through add_label, one at a time.
#
# @param labels [#each] the labels to add
# @return [Object] whatever labels.each returns (the receiver for an Array)
def add_labels(labels)
  labels.each { |entry| add_label(entry) }
end

#clone ⇒ Object



98
99
100
# File 'lib/boilerpipe/document/text_block.rb', line 98

# Cloning a text block is not supported.
#
# Uses `raise`, not `throw`: `throw` belongs to catch/throw control flow and,
# with no matching `catch`, surfaces as UncaughtThrowError instead of the
# NotImplementedError this method means to signal.
#
# NotImplementedError descends from ScriptError, so a bare `rescue` does not
# catch it; callers must rescue it by name.
#
# @raise [NotImplementedError] always
def clone
  raise NotImplementedError, "#{self.class}#clone is not implemented"
end

#has_label?(label) ⇒ Boolean

Returns:

  • (Boolean)


57
58
59
# File 'lib/boilerpipe/document/text_block.rb', line 57

# Tells whether +label+ is currently in this block's label set.
#
# @param label [Object] the label to look up
# @return [Boolean]
def has_label?(label)
  @labels.member?(label)
end

#is_content? ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
# File 'lib/boilerpipe/document/text_block.rb', line 39

# Predicate form of the content flag: returns @content as stored.
def is_content?
  @content
end

#is_not_content? ⇒ Boolean

Returns:

  • (Boolean)


43
44
45
# File 'lib/boilerpipe/document/text_block.rb', line 43

# Logical negation of is_content?.
def is_not_content?
  !is_content?
end

#merge_next(other) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/boilerpipe/document/text_block.rb', line 65

# Folds the block +other+ into this one, mutating the receiver.
#
# Text is joined with a newline, the word/line counters are summed, the
# offset range is widened to cover both blocks, the content flag is OR-ed,
# labels are unioned and the smaller tag level wins.
#
# @param other [TextBlock] the block to absorb; it is only read
# @return [Object] the resulting tag level (value of the last assignment)
def merge_next(other)
  @text = "#{@text}\n#{other.text}"

  # Counters accumulate.
  @num_words += other.num_words
  @num_words_in_anchor_text += other.num_words_in_anchor_text
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
  @num_wrapped_lines += other.num_wrapped_lines

  # The merged block spans from the earliest start to the latest end.
  @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
  @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max

  # Kept at this point in the sequence, after the counters and offsets change.
  init_densities

  @content |= other.is_content?
  @num_full_text_words += other.num_full_text_words

  # Work on a copy of the other block's labels; `other` is only read.
  incoming = other.labels
  if incoming
    copied = incoming.clone
    if @labels.nil?
      @labels = copied
    else
      @labels.merge(copied)
    end
  end

  @tag_level = [@tag_level, other.tag_level].min
end

#remove_label(label) ⇒ Object



61
62
63
# File 'lib/boilerpipe/document/text_block.rb', line 61

# Removes +label+ from @labels and returns whatever @labels.delete returns
# (for the Set created by the constructor, that is the set itself).
def remove_label(label)
  @labels.delete(label)
end

#set_tag_level(level) ⇒ Object



35
36
37
# File 'lib/boilerpipe/document/text_block.rb', line 35

# Stores +level+ in @tag_level and returns it.
def set_tag_level(level)
  @tag_level = level
end

#to_s ⇒ Object



89
90
91
92
93
94
95
96
# File 'lib/boilerpipe/document/text_block.rb', line 89

# Renders the block as a bracketed statistics header (offset range, tag
# level, word count, wrapped-line count, link density), a tab, the
# CONTENT/BOILERPLATE marker, the labels (or "null" when there are none),
# then a newline and the block text.
#
# @return [String]
def to_s
  #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
  label_list = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
  marker = is_content? ? 'CONTENT' : 'BOILERPLATE'
  stats = "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]"
  "#{stats}\t#{marker},#{label_list}\n#{text}"
end