10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
# File 'lib/boilerpipe/filters/split_paragraph_blocks_filter.rb', line 10
# Breaks multi-line text blocks apart so that each line becomes its own block.
#
# Every block in +doc.text_blocks+ has its text split on runs of "\n" / "\r".
# A block that yields fewer than two pieces is carried over untouched; any
# other block is replaced by one new TextBlock per piece, each given the
# source block's content flag and labels. The document's blocks are swapped
# (via +replace_text_blocks!+) only when at least one block was split.
#
# @param doc an object exposing +text_blocks+ and +replace_text_blocks!+
# @return the same +doc+ that was passed in
def self.process(doc)
  rebuilt = []
  split_any = false

  doc.text_blocks.each do |block|
    pieces = block.text.split(/[\n\r]+/)

    if pieces.size >= 2
      split_any = true
      # Read once per source block; every fragment inherits the same values.
      content_flag = block.is_content?
      inherited_labels = block.labels

      fragments = pieces.map do |piece|
        ::Boilerpipe::Document::TextBlock.new(piece).tap do |fragment|
          fragment.content = content_flag
          fragment.add_labels(inherited_labels)
        end
      end
      rebuilt.concat(fragments)
    else
      # Nothing to split: keep the original object, not a copy.
      rebuilt << block
    end
  end

  # Leave the document alone when no block was actually split.
  doc.replace_text_blocks!(rebuilt) if split_any
  doc
end
|