Class: Boilerpipe::Document::TextBlock

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/document/text_block.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0) ⇒ TextBlock

Returns a new instance of TextBlock.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/boilerpipe/document/text_block.rb', line 12

# Sets up a block from its text and a handful of caller-supplied counters,
# then delegates to init_densities (defined elsewhere in the class).
#
# @param text the block's text; stored unchanged in @text
# @param num_words stored in @num_words (defaults to 0)
# @param num_words_in_anchor_text stored in @num_words_in_anchor_text (defaults to 0)
# @param num_words_in_wrapped_lines stored in @num_words_in_wrapped_lines (defaults to 0)
# @param num_wrapped_lines stored in @num_wrapped_lines (defaults to 1)
# @param offset_blocks used for BOTH @offset_blocks_start and @offset_blocks_end (defaults to 0)
def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
  # Text plus classification state: no labels, not content, tag level 0.
  @text = text
  @labels = Set.new
  @content = false
  @tag_level = 0

  # Counters handed in by the caller; the full-text word count always starts at 0.
  @num_words = num_words
  @num_words_in_anchor_text = num_words_in_anchor_text
  @num_words_in_wrapped_lines = num_words_in_wrapped_lines
  @num_wrapped_lines = num_wrapped_lines
  @num_full_text_words = 0

  # A new block starts and ends at the same offset.
  @offset_blocks_start = @offset_blocks_end = offset_blocks

  # Runs last, after every counter above has been assigned.
  init_densities
end

Instance Attribute Details

#content ⇒ Object

Returns the value of attribute content.



10
11
12
# File 'lib/boilerpipe/document/text_block.rb', line 10

# Reader for the content flag.
#
# @return [Object] the current value of @content; #initialize sets it to
#   false and #merge_next ORs it with the merged block's is_content? value
def content
  @content
end

#labels ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the label collection.
#
# @return [Object] the current value of @labels; #initialize creates it as an
#   empty Set, and #add_label, #remove_label and #merge_next modify it
def labels
  @labels
end

#link_density ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the link density.
#
# NOTE(review): @link_density is not assigned anywhere in the visible code;
# presumably init_densities populates it -- confirm.
#
# @return [Object] the current value of @link_density
def link_density
  @link_density
end

#num_full_text_words ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the full-text word counter.
#
# @return [Object] the current value of @num_full_text_words; #initialize
#   sets it to 0 and #merge_next adds the merged block's value to it
def num_full_text_words
  @num_full_text_words
end

#num_words ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the word counter.
#
# @return [Object] the current value of @num_words; taken from the
#   constructor argument and increased by #merge_next
def num_words
  @num_words
end

#num_words_in_anchor_text ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the anchor-text word counter.
#
# @return [Object] the current value of @num_words_in_anchor_text; taken from
#   the constructor argument and increased by #merge_next
def num_words_in_anchor_text
  @num_words_in_anchor_text
end

#num_words_in_wrapped_lines ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the wrapped-lines word counter.
#
# @return [Object] the current value of @num_words_in_wrapped_lines; taken
#   from the constructor argument and increased by #merge_next
def num_words_in_wrapped_lines
  @num_words_in_wrapped_lines
end

#num_wrapped_lines ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the wrapped-line counter.
#
# @return [Object] the current value of @num_wrapped_lines; taken from the
#   constructor argument (default 1) and increased by #merge_next
def num_wrapped_lines
  @num_wrapped_lines
end

#offset_blocks_end ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the end offset.
#
# @return [Object] the current value of @offset_blocks_end; initialised from
#   the constructor's offset_blocks argument, and #merge_next replaces it with
#   the larger of this block's and the merged block's end offset
def offset_blocks_end
  @offset_blocks_end
end

#offset_blocks_start ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the start offset.
#
# @return [Object] the current value of @offset_blocks_start; initialised from
#   the constructor's offset_blocks argument, and #merge_next replaces it with
#   the smaller of this block's and the merged block's start offset
def offset_blocks_start
  @offset_blocks_start
end

#tag_level ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the tag level.
#
# @return [Object] the current value of @tag_level; #initialize sets it to 0,
#   #set_tag_level overwrites it, and #merge_next keeps the smaller of the
#   two blocks' levels
def tag_level
  @tag_level
end

#text ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the block's text.
#
# @return [Object] the current value of @text; taken from the constructor
#   argument, and #merge_next appends "\n" plus the merged block's text
def text
  @text
end

#text_density ⇒ Object (readonly)

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)



6
7
8
# File 'lib/boilerpipe/document/text_block.rb', line 6

# Reader for the text density.
#
# NOTE(review): @text_density is not assigned anywhere in the visible code;
# presumably init_densities populates it -- confirm.
#
# @return [Object] the current value of @text_density
def text_density
  @text_density
end

Class Method Details

.empty_start ⇒ Object



28
29
30
# File 'lib/boilerpipe/document/text_block.rb', line 28

# Builds a block with empty text, all four counters set to 0 and an
# offset_blocks value of -1.
#
# Note that num_wrapped_lines is passed explicitly as 0 here, whereas the
# constructor's own default for that parameter is 1.
#
# NOTE(review): presumably a sentinel that sorts before every real block --
# confirm against callers.
#
# @return a new instance of the receiving class
def self.empty_start
  new('', 0, 0, 0, 0, -1)
end

Instance Method Details

#add_label(label) ⇒ Object



44
45
46
# File 'lib/boilerpipe/document/text_block.rb', line 44

# Attaches a single label to this block.
#
# @param label the label to store in @labels
# @return the @labels collection itself (what Set#add returns)
def add_label(label)
  # Set#add is the named form of Set#<<.
  @labels.add(label)
end

#add_labels(labels) ⇒ Object



48
49
50
51
52
# File 'lib/boilerpipe/document/text_block.rb', line 48

# Attaches every label in +labels+ by calling #add_label once per element.
#
# @param labels a collection responding to #each
# @return whatever +labels.each+ returns (for an Array, the array itself)
def add_labels(labels)
  labels.each { |item| add_label(item) }
end

#clone ⇒ Object



95
96
97
# File 'lib/boilerpipe/document/text_block.rb', line 95

# Copying a block is not supported; calling this always fails.
#
# @raise [NotImplementedError] always
def clone
  # Use `raise`, not `throw`: Kernel#throw is the catch/throw unwinding
  # primitive, so with no matching `catch` it surfaced as UncaughtThrowError
  # instead of the intended NotImplementedError.
  raise NotImplementedError, 'TextBlock#clone is not implemented'
end

#has_label?(label) ⇒ Boolean

Returns:

  • (Boolean)


54
55
56
# File 'lib/boilerpipe/document/text_block.rb', line 54

# Tells whether +label+ is currently attached to this block.
#
# @param label the label to look up in @labels
# @return [Boolean] the result of the membership test on @labels
def has_label?(label)
  # member? is the synonym of include? on Set (and on Enumerable in general).
  @labels.member?(label)
end

#is_content? ⇒ Boolean

Returns:

  • (Boolean)


36
37
38
# File 'lib/boilerpipe/document/text_block.rb', line 36

# Predicate form of the content flag.
#
# @return [Object] the raw value of @content (no coercion is applied);
#   #initialize sets it to false
def is_content?
  @content
end

#is_not_content? ⇒ Boolean

Returns:

  • (Boolean)


40
41
42
# File 'lib/boilerpipe/document/text_block.rb', line 40

# Logical negation of #is_content?.
#
# @return [Boolean] the result of applying `!` to #is_content?
def is_not_content?
  !is_content?
end

#merge_next(other) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/boilerpipe/document/text_block.rb', line 62

# Folds +other+ into this block in place.
#
# Text is joined with a newline, the word/line counters are summed, the
# offset range is widened to cover both blocks, the content flag is OR-ed,
# labels are unioned and the smaller tag level is kept.
#
# @param other a block exposing the same readers as this class, plus is_content?
# @return the resulting tag level (the value of the final assignment)
def merge_next(other)
  @text = "#{@text}\n#{other.text}"

  # Offset range: earliest start, latest end.
  @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
  @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max

  # Counters are additive.
  @num_wrapped_lines += other.num_wrapped_lines
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
  @num_words_in_anchor_text += other.num_words_in_anchor_text
  @num_words += other.num_words

  # init_densities is defined elsewhere; it is kept at the same point in the
  # sequence (after the counters above, before the updates below).
  init_densities

  @num_full_text_words += other.num_full_text_words
  # Non-short-circuit `|`, so other.is_content? is always evaluated.
  @content = @content | other.is_content?

  incoming = other.labels
  if incoming
    # Work on a copy so the other block's label collection is never shared.
    copied = incoming.clone
    if @labels.nil?
      @labels = copied
    else
      @labels.merge(copied)
    end
  end

  @tag_level = [@tag_level, other.tag_level].min
end

#remove_label(label) ⇒ Object



58
59
60
# File 'lib/boilerpipe/document/text_block.rb', line 58

# Detaches +label+ from this block by delegating to @labels.delete.
#
# @param label the label to remove
# @return whatever @labels.delete returns; @labels is a Set when it was
#   created by #initialize, in which case that is the set itself
def remove_label(label)
  @labels.delete(label)
end

#set_tag_level(level) ⇒ Object



32
33
34
# File 'lib/boilerpipe/document/text_block.rb', line 32

# Overwrites the block's tag level.
#
# @param level the new value for @tag_level
# @return the value that was assigned
def set_tag_level(level)
  @tag_level = level
end

#to_s ⇒ Object



86
87
88
89
90
91
92
93
# File 'lib/boilerpipe/document/text_block.rb', line 86

# Renders a one-line summary of the block followed by its text.
#
# Layout: "[start-end;tl=T; nw=W;nwl=L;ld=D]<TAB>KIND,LABELS<NEWLINE>text",
# where KIND is CONTENT or BOILERPLATE and LABELS is "null" when no labels
# are set, otherwise a bracketed comma-separated list.
#
# Java original kept for reference:
# "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
#
# @return [String]
def to_s
  label_part = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
  kind = is_content? ? 'CONTENT' : 'BOILERPLATE'
  stats = "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]"

  "#{stats}\t#{kind},#{label_part}\n#{text}"
end